diff --git a/Assembler/KeplerAs/Install.sh b/Assembler/KeplerAs/Install.sh new file mode 100755 index 0000000..57c8d24 --- /dev/null +++ b/Assembler/KeplerAs/Install.sh @@ -0,0 +1,3 @@ +perl Makefile.PL +make +sudo make install diff --git a/Assembler/KeplerAs/Install_locally.sh b/Assembler/KeplerAs/Install_locally.sh new file mode 100755 index 0000000..79be922 --- /dev/null +++ b/Assembler/KeplerAs/Install_locally.sh @@ -0,0 +1,6 @@ +perl Makefile.PL +make + +#configure the following variables in .bashrc; then source ~/.bashrc +#export PERL5LIB=/home/xiuxia/PPoPP2017_artifact/KeplerAs/blib/lib/:$PERL5LIB +#export PATH=/home/xiuxia/PPoPP2017_artifact/KeplerAs/blib/script:$PATH diff --git a/Assembler/KeplerAs/LICENSE b/Assembler/KeplerAs/LICENSE new file mode 100644 index 0000000..2c9314c --- /dev/null +++ b/Assembler/KeplerAs/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2014 Scott Gray +Copyright (c) 2015~2016 Xiuxia Zhang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/Assembler/KeplerAs/MYMETA.json b/Assembler/KeplerAs/MYMETA.json new file mode 100644 index 0000000..cb97ff4 --- /dev/null +++ b/Assembler/KeplerAs/MYMETA.json @@ -0,0 +1,42 @@ +{ + "abstract" : "Assembler for NVIDIA Maxwell architecture", + "author" : [ + "Xiuxia Zhang " + ], + "dynamic_config" : 0, + "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001", + "license" : [ + "mit" + ], + "meta-spec" : { + "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", + "version" : "2" + }, + "name" : "KeplerAs-KeplerAs", + "no_index" : { + "directory" : [ + "t", + "inc" + ] + }, + "prereqs" : { + "build" : { + "requires" : { + "ExtUtils::MakeMaker" : "0" + } + }, + "configure" : { + "requires" : { + "ExtUtils::MakeMaker" : "0" + } + }, + "runtime" : { + "requires" : { + "Carp" : "1.29", + "Data::Dumper" : "2.145" + } + } + }, + "release_status" : "stable", + "version" : "1.06" +} diff --git a/Assembler/KeplerAs/MYMETA.yml b/Assembler/KeplerAs/MYMETA.yml new file mode 100644 index 0000000..7a0496d --- /dev/null +++ b/Assembler/KeplerAs/MYMETA.yml @@ -0,0 +1,23 @@ +--- +abstract: 'Assembler for NVIDIA Maxwell architecture' +author: + - 'Xiuxia Zhang ' +build_requires: + ExtUtils::MakeMaker: '0' +configure_requires: + ExtUtils::MakeMaker: '0' +dynamic_config: 0 +generated_by: 'ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001' +license: mit +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: '1.4' +name: KeplerAs-KeplerAs +no_index: + directory: + - t + - inc +requires: + Carp: '1.29' + Data::Dumper: '2.145' +version: '1.06' diff --git a/Assembler/KeplerAs/Makefile b/Assembler/KeplerAs/Makefile new file mode 100644 index 0000000..f6dc8a8 --- /dev/null +++ b/Assembler/KeplerAs/Makefile @@ -0,0 +1,878 @@ +# This Makefile is for the KeplerAs::KeplerAs extension to perl. 
+# +# It was generated automatically by MakeMaker version +# 7.0401 (Revision: 70401) from the contents of +# Makefile.PL. Don't edit this file, edit Makefile.PL instead. +# +# ANY CHANGES MADE HERE WILL BE LOST! +# +# MakeMaker ARGV: () +# + +# MakeMaker Parameters: + +# ABSTRACT_FROM => q[lib/KeplerAs/KeplerAs.pm] +# AUTHOR => [q[Xiuxia Zhang ]] +# BUILD_REQUIRES => { } +# CONFIGURE_REQUIRES => { } +# EXE_FILES => [q[bin/KeplerAs.pl]] +# LICENSE => q[MIT] +# NAME => q[KeplerAs::KeplerAs] +# PREREQ_PM => { Carp=>q[1.29], Data::Dumper=>q[2.145] } +# TEST_REQUIRES => { } +# VERSION_FROM => q[lib/KeplerAs/KeplerAs.pm] + +# --- MakeMaker post_initialize section: + + +# --- MakeMaker const_config section: + +# These definitions are from config.sh (via /usr/lib/x86_64-linux-gnu/perl/5.22/Config.pm). +# They may have been overridden via Makefile.PL or on the command line. +AR = ar +CC = x86_64-linux-gnu-gcc +CCCDLFLAGS = -fPIC +CCDLFLAGS = -Wl,-E +DLEXT = so +DLSRC = dl_dlopen.xs +EXE_EXT = +FULL_AR = /usr/bin/ar +LD = x86_64-linux-gnu-gcc +LDDLFLAGS = -shared -L/usr/local/lib -fstack-protector-strong +LDFLAGS = -fstack-protector-strong -L/usr/local/lib +LIBC = libc-2.21.so +LIB_EXT = .a +OBJ_EXT = .o +OSNAME = linux +OSVERS = 3.16.0 +RANLIB = : +SITELIBEXP = /usr/local/share/perl/5.22.1 +SITEARCHEXP = /usr/local/lib/x86_64-linux-gnu/perl/5.22.1 +SO = so +VENDORARCHEXP = /usr/lib/x86_64-linux-gnu/perl5/5.22 +VENDORLIBEXP = /usr/share/perl5 + + +# --- MakeMaker constants section: +AR_STATIC_ARGS = cr +DIRFILESEP = / +DFSEP = $(DIRFILESEP) +NAME = KeplerAs::KeplerAs +NAME_SYM = KeplerAs_KeplerAs +VERSION = 1.06 +VERSION_MACRO = VERSION +VERSION_SYM = 1_06 +DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\" +XS_VERSION = 1.06 +XS_VERSION_MACRO = XS_VERSION +XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\" +INST_ARCHLIB = blib/arch +INST_SCRIPT = blib/script +INST_BIN = blib/bin +INST_LIB = blib/lib +INST_MAN1DIR = blib/man1 +INST_MAN3DIR = blib/man3 +MAN1EXT = 1p 
+MAN3EXT = 3pm +INSTALLDIRS = site +DESTDIR = +PREFIX = $(SITEPREFIX) +PERLPREFIX = /usr +SITEPREFIX = /usr/local +VENDORPREFIX = /usr +INSTALLPRIVLIB = /usr/share/perl/5.22 +DESTINSTALLPRIVLIB = $(DESTDIR)$(INSTALLPRIVLIB) +INSTALLSITELIB = /usr/local/share/perl/5.22.1 +DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB) +INSTALLVENDORLIB = /usr/share/perl5 +DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB) +INSTALLARCHLIB = /usr/lib/x86_64-linux-gnu/perl/5.22 +DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB) +INSTALLSITEARCH = /usr/local/lib/x86_64-linux-gnu/perl/5.22.1 +DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH) +INSTALLVENDORARCH = /usr/lib/x86_64-linux-gnu/perl5/5.22 +DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH) +INSTALLBIN = /usr/bin +DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN) +INSTALLSITEBIN = /usr/local/bin +DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN) +INSTALLVENDORBIN = /usr/bin +DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN) +INSTALLSCRIPT = /usr/bin +DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT) +INSTALLSITESCRIPT = /usr/local/bin +DESTINSTALLSITESCRIPT = $(DESTDIR)$(INSTALLSITESCRIPT) +INSTALLVENDORSCRIPT = /usr/bin +DESTINSTALLVENDORSCRIPT = $(DESTDIR)$(INSTALLVENDORSCRIPT) +INSTALLMAN1DIR = /usr/share/man/man1 +DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR) +INSTALLSITEMAN1DIR = /usr/local/man/man1 +DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR) +INSTALLVENDORMAN1DIR = /usr/share/man/man1 +DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR) +INSTALLMAN3DIR = /usr/share/man/man3 +DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR) +INSTALLSITEMAN3DIR = /usr/local/man/man3 +DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR) +INSTALLVENDORMAN3DIR = /usr/share/man/man3 +DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR) +PERL_LIB = /usr/share/perl/5.22 +PERL_ARCHLIB = /usr/lib/x86_64-linux-gnu/perl/5.22 +PERL_ARCHLIBDEP = /usr/lib/x86_64-linux-gnu/perl/5.22 +LIBPERL_A = libperl.a 
+FIRST_MAKEFILE = Makefile +MAKEFILE_OLD = Makefile.old +MAKE_APERL_FILE = Makefile.aperl +PERLMAINCC = $(CC) +PERL_INC = /usr/lib/x86_64-linux-gnu/perl/5.22/CORE +PERL_INCDEP = /usr/lib/x86_64-linux-gnu/perl/5.22/CORE +PERL = "/usr/bin/perl" +FULLPERL = "/usr/bin/perl" +ABSPERL = $(PERL) +PERLRUN = $(PERL) +FULLPERLRUN = $(FULLPERL) +ABSPERLRUN = $(ABSPERL) +PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +PERL_CORE = 0 +PERM_DIR = 755 +PERM_RW = 644 +PERM_RWX = 755 + +MAKEMAKER = /usr/share/perl/5.22/ExtUtils/MakeMaker.pm +MM_VERSION = 7.0401 +MM_REVISION = 70401 + +# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle). +# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle) +# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar) +# DLBASE = Basename part of dynamic library. May be just equal BASEEXT. 
+MAKE = make +FULLEXT = KeplerAs/KeplerAs +BASEEXT = KeplerAs +PARENT_NAME = KeplerAs +DLBASE = $(BASEEXT) +VERSION_FROM = lib/KeplerAs/KeplerAs.pm +OBJECT = +LDFROM = $(OBJECT) +LINKTYPE = dynamic +BOOTDEP = + +# Handy lists of source code files: +XS_FILES = +C_FILES = +O_FILES = +H_FILES = +MAN1PODS = +MAN3PODS = lib/KeplerAs/KeplerAs.pm + +# Where is the Config information that we are using/depend on +CONFIGDEP = $(PERL_ARCHLIBDEP)$(DFSEP)Config.pm $(PERL_INCDEP)$(DFSEP)config.h + +# Where to build things +INST_LIBDIR = $(INST_LIB)/KeplerAs +INST_ARCHLIBDIR = $(INST_ARCHLIB)/KeplerAs + +INST_AUTODIR = $(INST_LIB)/auto/$(FULLEXT) +INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT) + +INST_STATIC = +INST_DYNAMIC = +INST_BOOT = + +# Extra linker info +EXPORT_LIST = +PERL_ARCHIVE = +PERL_ARCHIVEDEP = +PERL_ARCHIVE_AFTER = + + +TO_INST_PM = lib/KeplerAs/Cubin.pm \ + lib/KeplerAs/KeplerAs.pm \ + lib/KeplerAs/KeplerAsGrammar.pm + +PM_TO_BLIB = lib/KeplerAs/Cubin.pm \ + blib/lib/KeplerAs/Cubin.pm \ + lib/KeplerAs/KeplerAs.pm \ + blib/lib/KeplerAs/KeplerAs.pm \ + lib/KeplerAs/KeplerAsGrammar.pm \ + blib/lib/KeplerAs/KeplerAsGrammar.pm + + +# --- MakeMaker platform_constants section: +MM_Unix_VERSION = 7.0401 +PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc + + +# --- MakeMaker tool_autosplit section: +# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto +AUTOSPLITFILE = $(ABSPERLRUN) -e 'use AutoSplit; autosplit($$$$ARGV[0], $$$$ARGV[1], 0, 1, 1)' -- + + + +# --- MakeMaker tool_xsubpp section: + + +# --- MakeMaker tools_other section: +SHELL = /bin/sh +CHMOD = chmod +CP = cp +MV = mv +NOOP = $(TRUE) +NOECHO = @ +RM_F = rm -f +RM_RF = rm -rf +TEST_F = test -f +TOUCH = touch +UMASK_NULL = umask 0 +DEV_NULL = > /dev/null 2>&1 +MKPATH = $(ABSPERLRUN) -MExtUtils::Command -e 'mkpath' -- +EQUALIZE_TIMESTAMP = $(ABSPERLRUN) -MExtUtils::Command -e 'eqtime' -- +FALSE = false +TRUE = true +ECHO = echo 
+ECHO_N = echo -n +UNINST = 0 +VERBINST = 0 +MOD_INSTALL = $(ABSPERLRUN) -MExtUtils::Install -e 'install([ from_to => {@ARGV}, verbose => '\''$(VERBINST)'\'', uninstall_shadows => '\''$(UNINST)'\'', dir_mode => '\''$(PERM_DIR)'\'' ]);' -- +DOC_INSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'perllocal_install' -- +UNINSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'uninstall' -- +WARN_IF_OLD_PACKLIST = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'warn_if_old_packlist' -- +MACROSTART = +MACROEND = +USEMAKEFILE = -f +FIXIN = $(ABSPERLRUN) -MExtUtils::MY -e 'MY->fixin(shift)' -- +CP_NONEMPTY = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'cp_nonempty' -- + + +# --- MakeMaker makemakerdflt section: +makemakerdflt : all + $(NOECHO) $(NOOP) + + +# --- MakeMaker dist section: +TAR = tar +TARFLAGS = cvf +ZIP = zip +ZIPFLAGS = -r +COMPRESS = gzip --best +SUFFIX = .gz +SHAR = shar +PREOP = $(NOECHO) $(NOOP) +POSTOP = $(NOECHO) $(NOOP) +TO_UNIX = $(NOECHO) $(NOOP) +CI = ci -u +RCS_LABEL = rcs -Nv$(VERSION_SYM): -q +DIST_CP = best +DIST_DEFAULT = tardist +DISTNAME = KeplerAs-KeplerAs +DISTVNAME = KeplerAs-KeplerAs-1.06 + + +# --- MakeMaker macro section: + + +# --- MakeMaker depend section: + + +# --- MakeMaker cflags section: + + +# --- MakeMaker const_loadlibs section: + + +# --- MakeMaker const_cccmd section: + + +# --- MakeMaker post_constants section: + + +# --- MakeMaker pasthru section: + +PASTHRU = LIBPERL_A="$(LIBPERL_A)"\ + LINKTYPE="$(LINKTYPE)"\ + LD="$(LD)"\ + PREFIX="$(PREFIX)" + + +# --- MakeMaker special_targets section: +.SUFFIXES : .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT) + +.PHONY: all config static dynamic test linkext manifest blibdirs clean realclean disttest distdir + + + +# --- MakeMaker c_o section: + + +# --- MakeMaker xs_c section: + + +# --- MakeMaker xs_o section: + + +# --- MakeMaker top_targets section: +all :: pure_all manifypods + $(NOECHO) $(NOOP) + + +pure_all :: config pm_to_blib subdirs linkext + $(NOECHO) $(NOOP) + +subdirs :: $(MYEXTLIB) + 
$(NOECHO) $(NOOP) + +config :: $(FIRST_MAKEFILE) blibdirs + $(NOECHO) $(NOOP) + +help : + perldoc ExtUtils::MakeMaker + + +# --- MakeMaker blibdirs section: +blibdirs : $(INST_LIBDIR)$(DFSEP).exists $(INST_ARCHLIB)$(DFSEP).exists $(INST_AUTODIR)$(DFSEP).exists $(INST_ARCHAUTODIR)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists $(INST_SCRIPT)$(DFSEP).exists $(INST_MAN1DIR)$(DFSEP).exists $(INST_MAN3DIR)$(DFSEP).exists + $(NOECHO) $(NOOP) + +# Backwards compat with 6.18 through 6.25 +blibdirs.ts : blibdirs + $(NOECHO) $(NOOP) + +$(INST_LIBDIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_LIBDIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_LIBDIR) + $(NOECHO) $(TOUCH) $(INST_LIBDIR)$(DFSEP).exists + +$(INST_ARCHLIB)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_ARCHLIB) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHLIB) + $(NOECHO) $(TOUCH) $(INST_ARCHLIB)$(DFSEP).exists + +$(INST_AUTODIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_AUTODIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_AUTODIR) + $(NOECHO) $(TOUCH) $(INST_AUTODIR)$(DFSEP).exists + +$(INST_ARCHAUTODIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_ARCHAUTODIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHAUTODIR) + $(NOECHO) $(TOUCH) $(INST_ARCHAUTODIR)$(DFSEP).exists + +$(INST_BIN)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_BIN) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_BIN) + $(NOECHO) $(TOUCH) $(INST_BIN)$(DFSEP).exists + +$(INST_SCRIPT)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_SCRIPT) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_SCRIPT) + $(NOECHO) $(TOUCH) $(INST_SCRIPT)$(DFSEP).exists + +$(INST_MAN1DIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_MAN1DIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN1DIR) + $(NOECHO) $(TOUCH) $(INST_MAN1DIR)$(DFSEP).exists + +$(INST_MAN3DIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_MAN3DIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN3DIR) + $(NOECHO) $(TOUCH) 
$(INST_MAN3DIR)$(DFSEP).exists + + + +# --- MakeMaker linkext section: + +linkext :: $(LINKTYPE) + $(NOECHO) $(NOOP) + + +# --- MakeMaker dlsyms section: + + +# --- MakeMaker dynamic_bs section: + +BOOTSTRAP = + + +# --- MakeMaker dynamic section: + +dynamic :: $(FIRST_MAKEFILE) $(BOOTSTRAP) $(INST_DYNAMIC) + $(NOECHO) $(NOOP) + + +# --- MakeMaker dynamic_lib section: + + +# --- MakeMaker static section: + +## $(INST_PM) has been moved to the all: target. +## It remains here for awhile to allow for old usage: "make static" +static :: $(FIRST_MAKEFILE) $(INST_STATIC) + $(NOECHO) $(NOOP) + + +# --- MakeMaker static_lib section: + + +# --- MakeMaker manifypods section: + +POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--" +POD2MAN = $(POD2MAN_EXE) + + +manifypods : pure_all \ + lib/KeplerAs/KeplerAs.pm + $(NOECHO) $(POD2MAN) --section=$(MAN3EXT) --perm_rw=$(PERM_RW) -u \ + lib/KeplerAs/KeplerAs.pm $(INST_MAN3DIR)/KeplerAs::KeplerAs.$(MAN3EXT) + + + + +# --- MakeMaker processPL section: + + +# --- MakeMaker installbin section: + +EXE_FILES = bin/KeplerAs.pl + +pure_all :: $(INST_SCRIPT)/KeplerAs.pl + $(NOECHO) $(NOOP) + +realclean :: + $(RM_F) \ + $(INST_SCRIPT)/KeplerAs.pl + +$(INST_SCRIPT)/KeplerAs.pl : bin/KeplerAs.pl $(FIRST_MAKEFILE) $(INST_SCRIPT)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists + $(NOECHO) $(RM_F) $(INST_SCRIPT)/KeplerAs.pl + $(CP) bin/KeplerAs.pl $(INST_SCRIPT)/KeplerAs.pl + $(FIXIN) $(INST_SCRIPT)/KeplerAs.pl + -$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_SCRIPT)/KeplerAs.pl + + + +# --- MakeMaker subdirs section: + +# none + +# --- MakeMaker clean_subdirs section: +clean_subdirs : + $(NOECHO) $(NOOP) + + +# --- MakeMaker clean section: + +# Delete temporary files but do not touch installed files. We don't delete +# the Makefile here so a later make realclean still has a makefile to use. 
+ +clean :: clean_subdirs + - $(RM_F) \ + $(BASEEXT).bso $(BASEEXT).def \ + $(BASEEXT).exp $(BASEEXT).x \ + $(BOOTSTRAP) $(INST_ARCHAUTODIR)/extralibs.all \ + $(INST_ARCHAUTODIR)/extralibs.ld $(MAKE_APERL_FILE) \ + *$(LIB_EXT) *$(OBJ_EXT) \ + *perl.core MYMETA.json \ + MYMETA.yml blibdirs.ts \ + core core.*perl.*.? \ + core.[0-9] core.[0-9][0-9] \ + core.[0-9][0-9][0-9] core.[0-9][0-9][0-9][0-9] \ + core.[0-9][0-9][0-9][0-9][0-9] lib$(BASEEXT).def \ + mon.out perl \ + perl$(EXE_EXT) perl.exe \ + perlmain.c pm_to_blib \ + pm_to_blib.ts so_locations \ + tmon.out + - $(RM_RF) \ + blib + $(NOECHO) $(RM_F) $(MAKEFILE_OLD) + - $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL) + + +# --- MakeMaker realclean_subdirs section: +realclean_subdirs : + $(NOECHO) $(NOOP) + + +# --- MakeMaker realclean section: +# Delete temporary files (via clean) and also delete dist files +realclean purge :: clean realclean_subdirs + - $(RM_F) \ + $(MAKEFILE_OLD) $(FIRST_MAKEFILE) + - $(RM_RF) \ + $(DISTVNAME) + + +# --- MakeMaker metafile section: +metafile : create_distdir + $(NOECHO) $(ECHO) Generating META.yml + $(NOECHO) $(ECHO) '---' > META_new.yml + $(NOECHO) $(ECHO) 'abstract: '\''Assembler for NVIDIA Maxwell architecture'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'author:' >> META_new.yml + $(NOECHO) $(ECHO) ' - '\''Xiuxia Zhang '\''' >> META_new.yml + $(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: '\''0'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: '\''0'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'dynamic_config: 1' >> META_new.yml + $(NOECHO) $(ECHO) 'generated_by: '\''ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'license: mit' >> META_new.yml + $(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml + $(NOECHO) $(ECHO) ' url: http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml + 
$(NOECHO) $(ECHO) ' version: '\''1.4'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'name: KeplerAs-KeplerAs' >> META_new.yml + $(NOECHO) $(ECHO) 'no_index:' >> META_new.yml + $(NOECHO) $(ECHO) ' directory:' >> META_new.yml + $(NOECHO) $(ECHO) ' - t' >> META_new.yml + $(NOECHO) $(ECHO) ' - inc' >> META_new.yml + $(NOECHO) $(ECHO) 'requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' Carp: '\''1.29'\''' >> META_new.yml + $(NOECHO) $(ECHO) ' Data::Dumper: '\''2.145'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'version: '\''1.06'\''' >> META_new.yml + -$(NOECHO) $(MV) META_new.yml $(DISTVNAME)/META.yml + $(NOECHO) $(ECHO) Generating META.json + $(NOECHO) $(ECHO) '{' > META_new.json + $(NOECHO) $(ECHO) ' "abstract" : "Assembler for NVIDIA Maxwell architecture",' >> META_new.json + $(NOECHO) $(ECHO) ' "author" : [' >> META_new.json + $(NOECHO) $(ECHO) ' "Xiuxia Zhang "' >> META_new.json + $(NOECHO) $(ECHO) ' ],' >> META_new.json + $(NOECHO) $(ECHO) ' "dynamic_config" : 1,' >> META_new.json + $(NOECHO) $(ECHO) ' "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001",' >> META_new.json + $(NOECHO) $(ECHO) ' "license" : [' >> META_new.json + $(NOECHO) $(ECHO) ' "mit"' >> META_new.json + $(NOECHO) $(ECHO) ' ],' >> META_new.json + $(NOECHO) $(ECHO) ' "meta-spec" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",' >> META_new.json + $(NOECHO) $(ECHO) ' "version" : "2"' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "name" : "KeplerAs-KeplerAs",' >> META_new.json + $(NOECHO) $(ECHO) ' "no_index" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "directory" : [' >> META_new.json + $(NOECHO) $(ECHO) ' "t",' >> META_new.json + $(NOECHO) $(ECHO) ' "inc"' >> META_new.json + $(NOECHO) $(ECHO) ' ]' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "prereqs" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "build" : {' >> META_new.json + $(NOECHO) 
$(ECHO) ' "requires" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "ExtUtils::MakeMaker" : "0"' >> META_new.json + $(NOECHO) $(ECHO) ' }' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "configure" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "requires" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "ExtUtils::MakeMaker" : "0"' >> META_new.json + $(NOECHO) $(ECHO) ' }' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "runtime" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "requires" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "Carp" : "1.29",' >> META_new.json + $(NOECHO) $(ECHO) ' "Data::Dumper" : "2.145"' >> META_new.json + $(NOECHO) $(ECHO) ' }' >> META_new.json + $(NOECHO) $(ECHO) ' }' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "release_status" : "stable",' >> META_new.json + $(NOECHO) $(ECHO) ' "version" : "1.06"' >> META_new.json + $(NOECHO) $(ECHO) '}' >> META_new.json + -$(NOECHO) $(MV) META_new.json $(DISTVNAME)/META.json + + +# --- MakeMaker signature section: +signature : + cpansign -s + + +# --- MakeMaker dist_basics section: +distclean :: realclean distcheck + $(NOECHO) $(NOOP) + +distcheck : + $(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck + +skipcheck : + $(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck + +manifest : + $(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest + +veryclean : realclean + $(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old + + + +# --- MakeMaker dist_core section: + +dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE) + $(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \ + -e ' if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' -- + +tardist : $(DISTVNAME).tar$(SUFFIX) + $(NOECHO) $(NOOP) + +uutardist : $(DISTVNAME).tar$(SUFFIX) + uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > 
$(DISTVNAME).tar$(SUFFIX)_uu + $(NOECHO) $(ECHO) 'Created $(DISTVNAME).tar$(SUFFIX)_uu' + +$(DISTVNAME).tar$(SUFFIX) : distdir + $(PREOP) + $(TO_UNIX) + $(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME) + $(RM_RF) $(DISTVNAME) + $(COMPRESS) $(DISTVNAME).tar + $(NOECHO) $(ECHO) 'Created $(DISTVNAME).tar$(SUFFIX)' + $(POSTOP) + +zipdist : $(DISTVNAME).zip + $(NOECHO) $(NOOP) + +$(DISTVNAME).zip : distdir + $(PREOP) + $(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME) + $(RM_RF) $(DISTVNAME) + $(NOECHO) $(ECHO) 'Created $(DISTVNAME).zip' + $(POSTOP) + +shdist : distdir + $(PREOP) + $(SHAR) $(DISTVNAME) > $(DISTVNAME).shar + $(RM_RF) $(DISTVNAME) + $(NOECHO) $(ECHO) 'Created $(DISTVNAME).shar' + $(POSTOP) + + +# --- MakeMaker distdir section: +create_distdir : + $(RM_RF) $(DISTVNAME) + $(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \ + -e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');" + +distdir : create_distdir distmeta + $(NOECHO) $(NOOP) + + + +# --- MakeMaker dist_test section: +disttest : distdir + cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL + cd $(DISTVNAME) && $(MAKE) $(PASTHRU) + cd $(DISTVNAME) && $(MAKE) test $(PASTHRU) + + + +# --- MakeMaker dist_ci section: + +ci : + $(PERLRUN) "-MExtUtils::Manifest=maniread" \ + -e "@all = keys %{ maniread() };" \ + -e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \ + -e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});" + + +# --- MakeMaker distmeta section: +distmeta : create_distdir metafile + $(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'exit unless -e q{META.yml};' \ + -e 'eval { maniadd({q{META.yml} => q{Module YAML meta-data (added by MakeMaker)}}) }' \ + -e ' or print "Could not add META.yml to MANIFEST: $$$${'\''@'\''}\n"' -- + $(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'exit unless -f q{META.json};' \ + -e 'eval { maniadd({q{META.json} => q{Module JSON meta-data (added by MakeMaker)}}) }' \ + -e ' or print 
"Could not add META.json to MANIFEST: $$$${'\''@'\''}\n"' -- + + + +# --- MakeMaker distsignature section: +distsignature : create_distdir + $(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) }' \ + -e ' or print "Could not add SIGNATURE to MANIFEST: $$$${'\''@'\''}\n"' -- + $(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE + cd $(DISTVNAME) && cpansign -s + + + +# --- MakeMaker install section: + +install :: pure_install doc_install + $(NOECHO) $(NOOP) + +install_perl :: pure_perl_install doc_perl_install + $(NOECHO) $(NOOP) + +install_site :: pure_site_install doc_site_install + $(NOECHO) $(NOOP) + +install_vendor :: pure_vendor_install doc_vendor_install + $(NOECHO) $(NOOP) + +pure_install :: pure_$(INSTALLDIRS)_install + $(NOECHO) $(NOOP) + +doc_install :: doc_$(INSTALLDIRS)_install + $(NOECHO) $(NOOP) + +pure__install : pure_site_install + $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site + +doc__install : doc_site_install + $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site + +pure_perl_install :: all + $(NOECHO) umask 022; $(MOD_INSTALL) \ + "$(INST_LIB)" "$(DESTINSTALLPRIVLIB)" \ + "$(INST_ARCHLIB)" "$(DESTINSTALLARCHLIB)" \ + "$(INST_BIN)" "$(DESTINSTALLBIN)" \ + "$(INST_SCRIPT)" "$(DESTINSTALLSCRIPT)" \ + "$(INST_MAN1DIR)" "$(DESTINSTALLMAN1DIR)" \ + "$(INST_MAN3DIR)" "$(DESTINSTALLMAN3DIR)" + $(NOECHO) $(WARN_IF_OLD_PACKLIST) \ + "$(SITEARCHEXP)/auto/$(FULLEXT)" + + +pure_site_install :: all + $(NOECHO) umask 02; $(MOD_INSTALL) \ + read "$(SITEARCHEXP)/auto/$(FULLEXT)/.packlist" \ + write "$(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist" \ + "$(INST_LIB)" "$(DESTINSTALLSITELIB)" \ + "$(INST_ARCHLIB)" "$(DESTINSTALLSITEARCH)" \ + "$(INST_BIN)" "$(DESTINSTALLSITEBIN)" \ + "$(INST_SCRIPT)" "$(DESTINSTALLSITESCRIPT)" \ + "$(INST_MAN1DIR)" "$(DESTINSTALLSITEMAN1DIR)" \ + "$(INST_MAN3DIR)" 
"$(DESTINSTALLSITEMAN3DIR)" + $(NOECHO) $(WARN_IF_OLD_PACKLIST) \ + "$(PERL_ARCHLIB)/auto/$(FULLEXT)" + +pure_vendor_install :: all + $(NOECHO) umask 022; $(MOD_INSTALL) \ + "$(INST_LIB)" "$(DESTINSTALLVENDORLIB)" \ + "$(INST_ARCHLIB)" "$(DESTINSTALLVENDORARCH)" \ + "$(INST_BIN)" "$(DESTINSTALLVENDORBIN)" \ + "$(INST_SCRIPT)" "$(DESTINSTALLVENDORSCRIPT)" \ + "$(INST_MAN1DIR)" "$(DESTINSTALLVENDORMAN1DIR)" \ + "$(INST_MAN3DIR)" "$(DESTINSTALLVENDORMAN3DIR)" + + +doc_perl_install :: all + +doc_site_install :: all + $(NOECHO) $(ECHO) Appending installation info to "$(DESTINSTALLSITEARCH)/perllocal.pod" + -$(NOECHO) umask 02; $(MKPATH) "$(DESTINSTALLSITEARCH)" + -$(NOECHO) umask 02; $(DOC_INSTALL) \ + "Module" "$(NAME)" \ + "installed into" $(INSTALLSITELIB) \ + LINKTYPE "$(LINKTYPE)" \ + VERSION "$(VERSION)" \ + EXE_FILES "$(EXE_FILES)" \ + >> "$(DESTINSTALLSITEARCH)/perllocal.pod" + +doc_vendor_install :: all + + +uninstall :: uninstall_from_$(INSTALLDIRS)dirs + $(NOECHO) $(NOOP) + +uninstall_from_perldirs :: + +uninstall_from_sitedirs :: + $(NOECHO) $(UNINSTALL) "$(SITEARCHEXP)/auto/$(FULLEXT)/.packlist" + +uninstall_from_vendordirs :: + + +# --- MakeMaker force section: +# Phony target to force checking subdirectories. +FORCE : + $(NOECHO) $(NOOP) + + +# --- MakeMaker perldepend section: + + +# --- MakeMaker makefile section: +# We take a very conservative approach here, but it's worth it. +# We move Makefile to Makefile.old here to avoid gnu make looping. +$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP) + $(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?" + $(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..." + -$(NOECHO) $(RM_F) $(MAKEFILE_OLD) + -$(NOECHO) $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) + - $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL) + $(PERLRUN) Makefile.PL + $(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <==" + $(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command. 
<==" + $(FALSE) + + + +# --- MakeMaker staticmake section: + +# --- MakeMaker makeaperl section --- +MAP_TARGET = perl +FULLPERL = "/usr/bin/perl" + +$(MAP_TARGET) :: static $(MAKE_APERL_FILE) + $(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@ + +$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib + $(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET) + $(NOECHO) $(PERLRUNINST) \ + Makefile.PL DIR="" \ + MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \ + MAKEAPERL=1 NORECURS=1 CCCDLFLAGS= + + +# --- MakeMaker test section: + +TEST_VERBOSE=0 +TEST_TYPE=test_$(LINKTYPE) +TEST_FILE = test.pl +TEST_FILES = +TESTDB_SW = -d + +testdb :: testdb_$(LINKTYPE) + +test :: $(TEST_TYPE) subdirs-test + +subdirs-test :: + $(NOECHO) $(NOOP) + + $(NOECHO) $(ECHO) 'No tests defined for $(NAME) extension.' + +test_dynamic :: pure_all + +testdb_dynamic :: pure_all + PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE) + +test_ : test_dynamic + +test_static :: test_dynamic +testdb_static :: testdb_dynamic + + +# --- MakeMaker ppd section: +# Creates a PPD (Perl Package Description) for a binary distribution. 
+ppd : + $(NOECHO) $(ECHO) '' > $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' Assembler for NVIDIA Maxwell architecture' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' Xiuxia Zhang <zhangxiuxia1@gmail.com>' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) '' >> $(DISTNAME).ppd + + +# --- MakeMaker pm_to_blib section: + +pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM) + $(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \ + lib/KeplerAs/Cubin.pm blib/lib/KeplerAs/Cubin.pm \ + lib/KeplerAs/KeplerAs.pm blib/lib/KeplerAs/KeplerAs.pm \ + lib/KeplerAs/KeplerAsGrammar.pm blib/lib/KeplerAs/KeplerAsGrammar.pm + $(NOECHO) $(TOUCH) pm_to_blib + + +# --- MakeMaker selfdocument section: + + +# --- MakeMaker postamble section: + + +# End. diff --git a/Assembler/KeplerAs/Makefile.PL b/Assembler/KeplerAs/Makefile.PL new file mode 100644 index 0000000..4f71756 --- /dev/null +++ b/Assembler/KeplerAs/Makefile.PL @@ -0,0 +1,14 @@ +require 5.10.0; +use ExtUtils::MakeMaker; +# See lib/ExtUtils/MakeMaker.pm for details of how to influence +# the contents of the Makefile that is written. +WriteMakefile( + NAME => 'KeplerAs::KeplerAs', + VERSION_FROM => 'lib/KeplerAs/KeplerAs.pm', # finds $VERSION + EXE_FILES => ['bin/KeplerAs.pl'], + PREREQ_PM => {Carp => 1.29, Data::Dumper => 2.145}, + LICENSE => 'MIT', + ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 + (ABSTRACT_FROM => 'lib/KeplerAs/KeplerAs.pm', # retrieve abstract from module + AUTHOR => 'Xiuxia Zhang ') : ()), +); diff --git a/Assembler/KeplerAs/README.md b/Assembler/KeplerAs/README.md new file mode 100644 index 0000000..7b4a8ee --- /dev/null +++ b/Assembler/KeplerAs/README.md @@ -0,0 +1,7 @@ +## Kepler GPU assembler: KeplerAs + +Our KeplerAs is based on Maxas (for Maxwell and Pascal GPUs). +Kepler uses completely different ISA encodings compared with Maxwell GPUs. +We use the ISA encoding information cracked by our solver. + +Install.sh is a script to install the software. diff --git a/Assembler/KeplerAs/bin/KeplerAs.pl b/Assembler/KeplerAs/bin/KeplerAs.pl new file mode 100755 index 0000000..268cc85 --- /dev/null +++ b/Assembler/KeplerAs/bin/KeplerAs.pl @@ -0,0 +1,275 @@ +#!/usr/bin/perl +use strict; +use KeplerAs::Cubin; +use KeplerAs::KeplerAs; +use Data::Dumper; +use File::Spec; + +require 5.10.0; + +$Data::Dumper::Sortkeys = 1; + +my $mode = shift; + +if ($mode =~ /^\-?\-l/i) +{ + my $cubinFile = shift or usage(); + + my $cubin = KeplerAs::Cubin->new($cubinFile); + + my $arch = $cubin->arch; + my $class = $cubin->class; + my $asize = $cubin->address_size; + my $kernels = $cubin->listKernels; + my $symbols = $cubin->listSymbols; + + printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize; + + foreach my $ker (sort keys %$kernels) + { + printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)}; + } + foreach my $sym (sort keys %$symbols) + { + printf "Symbol: %s\n", $sym; + } +} +elsif ($mode =~ /^\-?\-t/i) +{ + my $reg = shift if $ARGV[0] =~ /^\-?\-r/i; + my $all = shift if $ARGV[0] =~ /^\-?\-a/i; + my $file = shift or usage(); + my $fh; + if (-T $file) + { + open $fh, $file or die "$file: $!"; + } + else + { + my $cubin = KeplerAs::Cubin->new($file); + 
my $arch = $cubin->arch; + + open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!"; + my $first = <$fh>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + } + exit(KeplerAs::KeplerAs::Test($fh, $reg, $all) ? 1 : 0); +} +elsif ($mode =~ /^\-?\-e/i) +{ + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $cubinFile = shift or usage(); + my $asmFile = shift; + my $cubin = KeplerAs::Cubin->new($cubinFile); + my $arch = $cubin->arch; + my $kernels = $cubin->listKernels; + + $kernelName ||= (sort keys %$kernels)[0]; + + my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName"; + + open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile: $!"; # report the command exactly as it was run + my $first = <$in>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n"; + + print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt)); + + print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n"; + + print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params}; + + print $out "#\n# Instructions:\n\n"; + + KeplerAs::KeplerAs::Extract($in, $out, $kernel->{Params}); + + close $out if $asmFile; + close $in; +} +elsif ($mode =~ /^\-?\-s/i) +{ + my $sassFile = shift or usage(); + my $asmFile = shift; + + open my $in, $sassFile or die "$sassFile: $!"; + + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + KeplerAs::KeplerAs::Extract($in, $out, []); + + close $out if $asmFile; + close $in; +} +elsif ($mode =~ /^\-?\-i/i) +{ + my $nowarn; + if ($ARGV[0] =~ /^\-?\-w/i) + { + $nowarn = shift; + } + my $kernelName; + if
($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $noReuse = shift if $ARGV[0] =~ /^\-?\-n/i; + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package KeplerAs::KeplerAs::CODE; our \$$name = '$value';" + } + + my $asmFile = shift or usage(); + my $cubinFile = shift or usage(); + my $newCubin = shift || $cubinFile; + + my $file; + if (open my $fh, $asmFile) + { + local $/; + $file = <$fh>; + close $fh; + } + else { die "$asmFile: $!" } + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName; + die "asm file missing kernel name or is badly formatted" unless $kernelName; + + my $kernel = KeplerAs::KeplerAs::Assemble($file, $include, !$noReuse, $nowarn); + + my $cubin = KeplerAs::Cubin->new($cubinFile); + $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName"; + + $cubin->modifyKernel(%$kernel); + + $cubin->write($newCubin); + + printf "Kernel: $kernelName, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n", + @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)}; # "%%" emits a literal percent sign + +} +elsif ($mode =~ /^\-?\-p/i) +{ + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package KeplerAs::KeplerAs::CODE; our \$$name = '$value';"; + } + my $debug = shift if $ARGV[0] =~ /^\-?\-d/i; + my $asmFile = shift or usage(); + my $asmFile2 = shift; + + die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2; + + open my $fh, $asmFile or die "$asmFile: $!"; + local $/; + my $file = <$fh>; + close $fh; + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + if ($asmFile2) + { + open $fh, ">$asmFile2" or die "$asmFile2: $!"; + } + else + { + $fh = \*STDOUT; + } + print $fh KeplerAs::KeplerAs::Preprocess($file, $include, $debug); +
close $fh; +} +elsif ($mode =~ /^\-?\-v/i) +{ + print "$KeplerAs::KeplerAs::VERSION\n"; +} +else +{ + print "$mode\n"; + usage(); +} + +exit(0); + + + +sub usage +{ + print < + + Test a cubin or sass file to to see if the assembler can reproduce all of the contained opcodes. + Also useful for extending the missing grammar rules. Defaults to only showing failures without --all. + With the --reg flag it will show register bank conflicts not hidden by reuse flags. + + KeplerAs.pl --test|-t [--reg|-r] [--all|-a] + + Extract a single kernel into an asm file from a cubin. + Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin. + + KeplerAs.pl --extract|-e [--kernel|-k kernel_name] [asm_file] + + Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes. + Include the debug flag to print out detailed scheduler info. + + KeplerAs.pl --pre|-p [--debug|-d] [new_asm_file] + + Insert the kernel asm back into the cubin. Overwrite existing or create new cubin. + Optionally you can skip register reuse flag auto insertion. This allows you to observe + performance without any reuse or you can use it to set the flags manually in your sass. 
+ + KeplerAs.pl --insert|-i [--noreuse|-n] [new_cubin_file] + + Display version information and exit: + + KeplerAs.pl --version|-v + +EOF + exit(1); +} + +__END__ diff --git a/Assembler/KeplerAs/blib/arch/.exists b/Assembler/KeplerAs/blib/arch/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/KeplerAs/blib/arch/auto/KeplerAs/KeplerAs/.exists b/Assembler/KeplerAs/blib/arch/auto/KeplerAs/KeplerAs/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/KeplerAs/blib/bin/.exists b/Assembler/KeplerAs/blib/bin/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/KeplerAs/blib/lib/KeplerAs/.exists b/Assembler/KeplerAs/blib/lib/KeplerAs/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/KeplerAs/blib/lib/KeplerAs/Cubin.pm b/Assembler/KeplerAs/blib/lib/KeplerAs/Cubin.pm new file mode 100644 index 0000000..867342d --- /dev/null +++ b/Assembler/KeplerAs/blib/lib/KeplerAs/Cubin.pm @@ -0,0 +1,604 @@ +package KeplerAs::Cubin; + +use strict; +use Data::Dumper; + +my @Elf32_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + L entry + L phOffset + L shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf64_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + Q entry + Q phOffset + Q shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf32_PrgHdr = qw( + L type + L offset + L vaddr + L paddr + L fileSize + L memSize + L flags + L align +); +my @Elf64_PrgHdr = qw( + L type + L flags + Q offset + Q vaddr + Q paddr + Q fileSize + Q memSize + Q align +); +my @Elf32_SecHdr = qw( + L name + L type + L flags + L addr + L offset + L size + L link + L info + L align + L entSize +); +my @Elf64_SecHdr = qw( + L name + L type + Q flags + Q addr + Q offset + Q size + L link + L info + Q 
align + Q entSize +); +my @Elf32_SymEnt = qw( + L name + L value + L size + C info + C other + S shIndx +); +my @Elf64_SymEnt = qw( + L name + C info + C other + S shIndx + Q value + Q size +); +my @symBind = qw(LOCAL GLOBAL WEAK); + +my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC); + +$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr; +$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr; +$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr; +$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt; + +$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr; +$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr; +$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr; +$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt; + +$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr ]; +$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ]; +$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ]; +$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ]; + +$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr ]; +$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ]; +$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ]; +$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ]; + +sub new +{ + my ($package, $file) = @_; + + my $cubin = bless { fileName => $file }, $package; + + open my $fh, $file or die "$file: $!"; + binmode($fh); + + my $data; + read $fh, $data, 0x34; + my $elfHdr = $cubin->{elfHdr} = {}; + @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data; + + my $class = $elfHdr->{fileClass}; + + if ($class == 2) + { + seek $fh, 0, 0; + read $fh, $data, 0x46; + @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data; + + $cubin->{Class} = 64; + } + else + { + $cubin->{Class} = 32; + } + + $cubin->{Arch} = "35"; + die "Cubin not in sm_35. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} != 35; + $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 
64 : 32; + + seek $fh, $elfHdr->{phOffset}, 0; + foreach (1 .. $elfHdr->{phNum}) + { + read $fh, $data, $elfHdr->{phEntSize}; + + my %prgHdr = (Indx => $_ - 1); + @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data; + push @{$cubin->{prgHdrs}}, \%prgHdr; + } + + seek $fh, $elfHdr->{shOffset}, 0; + foreach (1 .. $elfHdr->{shNum}) + { + read $fh, $data, $elfHdr->{shEntSize}; + + my %secHdr = (Indx => $_ - 1); + @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data; + push @{$cubin->{secHdrs}}, \%secHdr; + } + + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $data = ''; + if ($secHdr->{size} && $secHdr->{type} != 8) + { + seek $fh, $secHdr->{offset}, 0; + read $fh, $data, $secHdr->{size}; + } + if ($secHdr->{type} == 3) # STRTAB + { + my $strTab = $secHdr->{StrTab} = {}; + my $indx = 0; + foreach my $str (split "\0", $data) + { + $strTab->{$indx} = $str; + $indx += 1 + length($str); + } + } + if ($secHdr->{type} == 2) # SYMTAB + { + my $offset = 0; + while ($offset < $secHdr->{size}) + { + my $symEnt = {}; + @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize}); + $offset += $secHdr->{entSize}; + + push @{$secHdr->{SymTab}}, $symEnt; + } + } + $secHdr->{Data} = unpack 'H*', $data; + } + close $fh; + + my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab}; + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $secHdr->{Name} = $shStrTab->{$secHdr->{name}}; + $cubin->{$secHdr->{Name}} = $secHdr; + } + + my $strTab = $cubin->{'.strtab'}{StrTab}; + foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}}) + { + $symEnt->{Name} = $strTab->{$symEnt->{name}}; + + my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}]; + $secHdr->{SymbolEnt} = $symEnt; + + if (($symEnt->{info} & 0x0f) == 0x02) + { + my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr; + + $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4]; + + $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ]; + + 
$kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20; + + $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24; + + my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"}; + $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0; + + $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"}; + + my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"}; + if ($paramSec) + { + my @data = unpack "L*", pack "H*", $paramSec->{Data}; + + $paramSec->{ParamData} = \@data; + $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ]; + + my $idx = 0; + $idx++ while $idx < @data && $data[$idx] != 0x00080a04; + + my $first = $data[$idx+2] & 0xFFFF; + $idx += 4; + + my @params; + while ($idx < @data && $data[$idx] == 0x000c1704) + { + my $ord = $data[$idx+2] & 0xFFFF; + my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16); + my $psize = $data[$idx+3] >> 18; + my $align = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0; + unshift @params, "$ord:$offset:$psize:$align"; + $idx += 4; + } + my @staticParams = @data[0 .. 
($idx-1)]; + + + my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize); + while ($idx < @data) + { + my $code = $data[$idx] & 0xffff; + my $size = $data[$idx] >> 16; + $idx++; + + + if ($code == 0x1b03) + { + $maxregCount = $size; + } + elsif ($code == 0x1d04) + { + while ($size > 0) + { + push @ctaidOffsets, $data[$idx++]; + $size -= 4; + } + } + elsif ($code == 0x1c04) + { + while ($size > 0) + { + push @exitOffsets, $data[$idx++]; + $size -= 4; + } + } + elsif ($code == 0x0401) + { + $ctaidzUsed = 1; + } + elsif ($code == 0x1004) + { + while ($size > 0) + { + push @reqntid, $data[$idx++]; + $size -= 4; + } + } + elsif ($code == 0x0504) + { + while ($size > 0) + { + push @maxntid, $data[$idx++]; + $size -= 4; + } + } + elsif ($code == 0x1e04) + { + while ($size > 0) + { + push @stackSize, $data[$idx++]; + $size -= 4; + } + } + else + { + printf "Unknown Code 0x%02x (size:%d)\n", $code, $size; + } + } + $kernelSec->{Params} = \@params; + $kernelSec->{ParamCnt} = scalar @params; + + $paramSec->{StaticParams} = \@staticParams; + $paramSec->{MAXREG_COUNT} = $maxregCount; + $paramSec->{ExitOffsets} = \@exitOffsets; + $paramSec->{CTAIDOffsets} = \@ctaidOffsets; + $paramSec->{CTAIDZUsed} = $ctaidzUsed; + $paramSec->{REQNTID} = \@reqntid; + $paramSec->{MAXNTID} = \@maxntid; + $paramSec->{STACKSIZE} = \@stackSize; + } + } + elsif (($symEnt->{info} & 0x10) == 0x10) + { + $cubin->{Symbols}{$symEnt->{Name}} = $symEnt; + } + } + + + return $cubin; +} +sub class +{ + return shift()->{Class}; +} +sub arch +{ + return shift()->{Arch}; +} +sub address_size +{ + return shift()->{AddressSize}; +} +sub listKernels +{ + return shift()->{Kernels}; +} +sub listSymbols +{ + return shift()->{Symbols}; +} +sub getKernel +{ + my ($cubin, $kernel) = @_; + return $cubin->{Kernels}{$kernel}; +} + +sub modifyKernel +{ + my ($cubin, %params) = @_; + + my $kernelSec = $params{Kernel}; + my $newReg = $params{RegCnt}; + my $newBar = $params{BarCnt}; + my 
$exitOffsets = $params{ExitOffsets}; + my $ctaidOffsets = $params{CTAIDOffsets}; + my $ctaidzUsed = $params{CTAIDZUsed}; + my $newData = $params{KernelData}; + my $newSize = @$newData * 8; + + die "255 register max" if $newReg > 255; + die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63; + die "16 is max barrier count" if $newBar > 16; + + my $paramSec = $kernelSec->{ParamSec}; + my $kernelName = $kernelSec->{SymbolEnt}{Name}; + my $maxregCount = $paramSec->{MAXREG_COUNT}; + my $stackSize = $paramSec->{STACKSIZE}; + + $kernelSec->{KernelData} = $newData; + $kernelSec->{Data} = unpack "H*", pack "Q*", @$newData; + + if ($newReg != $kernelSec->{RegCnt}) + { + print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n"; + $kernelSec->{RegCnt} = $newReg; + $kernelSec->{info} &= ~0xff000000; + $kernelSec->{info} |= $newReg << 24; + } + if ($newBar != $kernelSec->{BarCnt}) + { + print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n"; + $kernelSec->{BarCnt} = $newBar; + $kernelSec->{flags} &= ~0x01f00000; + $kernelSec->{flags} |= $newBar << 20; + } + + my @paramData = @{$paramSec->{StaticParams}}; + if (defined $maxregCount) + { + push @paramData, ($maxregCount << 16) | 0x1b03; + } + + + my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets; + my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}}; + + if ($newCTAIDs ne $oldCTAIDs) + { + print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"; + } + if (@$ctaidOffsets) + { + push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04; + push @paramData, @$ctaidOffsets; + } + + my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets; + my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}}; + + if ($newExits ne $oldExits) + { + print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"; + } + if (@$exitOffsets) + { + push @paramData, (scalar(@$exitOffsets) << 
18) | 0x1c04; + push @paramData, @$exitOffsets; + } + + if ($ctaidzUsed != $paramSec->{CTAIDZUsed}) + { + print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"; + } + if ($ctaidzUsed) + { + push @paramData, 0x0401; + } + + if (@{$paramSec->{REQNTID}}) + { + push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004; + push @paramData, @{$paramSec->{REQNTID}}; + } + if (@{$paramSec->{MAXNTID}}) + { + push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504; + push @paramData, @{$paramSec->{MAXNTID}}; + } + if (@$stackSize) + { + push @paramData, (scalar(@$stackSize) << 18) | 0x1e04; + push @paramData, @$stackSize; + } + + my $newParamSize = scalar(@paramData)*4; + $paramSec->{Data} = unpack "H*", pack "L*", @paramData; + if ($newParamSize != $paramSec->{size}) + { + print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n"; + $cubin->updateSize($paramSec, $newParamSize); + } + + if ($newSize != $kernelSec->{size}) + { + print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n"; + $cubin->updateSize($kernelSec, $newSize, 1); + } +} + +sub updateSize +{ + my ($cubin, $sec, $newSize, $updatePrgSize) = @_; + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + my $delta = $newSize - $sec->{size}; + $sec->{size} = $newSize; + + if ($sec->{SymbolEnt}) + { + $sec->{SymbolEnt}{size} = $newSize; + my $symSection = $cubin->{'.symtab'}; + $symSection->{Data} = ''; + foreach my $symEnt (@{$symSection->{SymTab}}) + { + $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}}; + } + } + + my $pos = $elfHdr->{ehSize}; + my %sizeMap; + + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + next if $secHdr->{align} == 0; + + my $size = $secHdr->{type} == 8 ? 
0 : $secHdr->{size}; + + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pos += $secHdr->{align} - $pad; + } + $sizeMap{$secHdr->{offset}} = $pos; + + $secHdr->{offset} = $pos; + + $pos += $size; + } + + my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset}; + + $sizeMap{$elfHdr->{shOffset}} = $pos; + $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize; + + $elfHdr->{shOffset} = $pos; + $elfHdr->{phOffset} = $pos + $shSize; + + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}}; + + if ($updatePrgSize && $prgHdr->{type} == 1 && + $sec->{offset} >= $prgHdr->{offset} && + $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta) + { + $prgHdr->{fileSize} += $delta; + $prgHdr->{memSize} += $delta; + } + } +} + +sub write +{ + my ($cubin, $file) = @_; + + open my $fh, ">$file" or die "Error: could not open $file for writing: $!"; + binmode($fh); + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}}; + my $pos = $elfHdr->{ehSize}; + + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + next if $secHdr->{size} == 0 || $secHdr->{type} == 8; + + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pad = $secHdr->{align} - $pad; + print $fh join '', "\0" x $pad; + $pos += $pad; + } + + print $fh pack 'H*', $secHdr->{Data}; + $pos += $secHdr->{size}; + } + + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}}; + } + + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}}; + } + close $fh; +} + +__END__ + diff --git a/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAs.pm b/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAs.pm new file mode 100644 index 0000000..34dfbcd --- /dev/null +++ b/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAs.pm @@ -0,0 +1,1196 @@ +package KeplerAs::KeplerAs; + +require 5.10.0; + +use strict; +use 
Data::Dumper; +use KeplerAs::KeplerAsGrammar; +use File::Spec; +use Carp; + +our $VERSION = '1.06'; + +my %relOffset = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT); + +my %absOffset = map { $_ => 1 } qw(JCAL); + +my %jumpOp = (%relOffset, %absOffset); + +my %noDest = map { $_ => 1 } qw(ST STG STS STL RED); + +my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4); + +sub Assemble +{ + my ($file, $include, $doReuse, $nowarn) = @_; + + my $regMap = {}; + $file = Preprocess($file, $include, 0, $regMap); + my $vectors = delete $regMap->{__vectors}; + my $regBank = delete $regMap->{__regbank}; + + my $regCnt = 0; + my $barCnt = 0; + + my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse); + + push @instructs, $ctrl = {}; + + foreach my $line (split "\n", $file) + { + $lineNum++; + + next unless preProcessLine($line); + + if (my $inst = processAsmLine($line, $lineNum)) + { + + push @branches, @instructs+0 if exists $jumpOp{$inst->{op}}; + + push @{$ctrl->{ctrl}}, $inst->{ctrl}; + + $inst->{ctrl} = $ctrl; + + push @instructs, $inst; + push @instructs, $ctrl = {} if ((@instructs & 7) == 0); + } + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + $labels{$1} = @instructs+0; + } + else + { + die "badly formed line at $lineNum: $line\n"; + } + } + push @{$ctrl->{ctrl}}, 0x00; + push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' }; + while (@instructs & 7) + { + push @instructs, $ctrl = {} if ((@instructs & 7) == 0); + push @{$ctrl->{ctrl}}, 0x00; + push @instructs, { op => 'NOP', inst => 'NOP;' }; + } + + foreach my $i (@branches) + { + if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1}) + { die "instruction has invalid label: $instructs[$i]{inst}"; } + + $instructs[$i]{jump} = $labels{$1}; + + if (exists $relOffset{$instructs[$i]{op}}) + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; } + else + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; } + } + + foreach my $i (0 .. 
$#instructs) + { + next unless $i & 7; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + my $capData = parseInstruct($inst, $gram) or next; + + if ($doReuse) + { + my @r0 = getVecRegisters($vectors, $capData); + + + if (@r0 && !exists $noDest{$op}) + { + foreach my $slot (keys %reuseSlots) + { + if (my $reuse = $reuse{$slot}) + { + delete $reuse->{$_} foreach @r0; + } + } + } + %reuse = () if exists $jumpOp{$op}; + + if ($gram->{type}{reuse}) + { + foreach my $slot (keys %reuseSlots) + { + next unless exists $capData->{$slot}; + + my $r = $capData->{$slot}; + next if $r eq 'RZ'; + next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction + + my $reuse = $reuse{$slot} ||= {}; + + if (my $p = $reuse->{$r}) + { + $instructs[$p]{ctrl}{reuse}[($p & 7) - 1] |= $reuseSlots{$slot}; + + } + elsif (keys %$reuse > 2) + { + my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0]; + delete $reuse->{$oldest}; + } + $reuse->{$r} = $i; + } + } + } + elsif ($gram->{type}{reuse}) + { + $ctrl->{reuse}[($i & 7) - 1] = genReuseCode($capData); + } + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + foreach my $r (sort keys %$regBank) + { + my $bank = $regBank->{$r}; + my $avail = $regMap->{$r}; + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 7)) + { + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + last; + } + } + } + + my (%liveTime, %pairedBanks, %reuseHistory); + foreach my $i (0 .. 
$#instructs) + { + next unless $i & 7; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + my $capData = parseInstruct($inst, $gram) or next; + my $reuseType = $gram->{type}{reuse}; + + my (%addReuse, %delReuse); + foreach my $slot (qw(r8 r20 r39)) + { + my $r = $capData->{$slot} or next; + next if $r eq 'RZ'; + + my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r}; + + if (my $liveTime = $liveTime{$liveR}) + { + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + + my $slotHist = $reuseHistory{$slot} ||= {}; + my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0; + + + if (!$selfReuse && ref $regMap->{$r}) + { + foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39)) + { + my $r2 = $capData->{$slot2}; + next if $r2 eq 'RZ' || $r2 eq $r; + + my $slotHist2 = $reuseHistory{$slot2} ||= {}; + + + if (!$reuseType || !exists $slotHist2->{$r2}) + { + if (ref $regMap->{$r2}) + { + push @{$pairedBanks{$r}{pairs}}, $r2; + $pairedBanks{$r}{banks} ||= []; + } + else + { + my $bank = substr($regMap->{$r2},1) & 7; + + $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++; + $pairedBanks{$r}{pairs} ||= []; + } + $pairedBanks{$r}{useCnt}++; + } + } + } + if ($reuseType) + { + if ($ctrl->{reuse}[($i & 7) - 1] & $reuseSlots{$slot}) + { $addReuse{$slot} = $r; } + else + { $delReuse{$slot} = $r; } + } + } + $reuseHistory{$_}{$addReuse{$_}} = 1 foreach keys %addReuse; + delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse; + + foreach my $r0 (getVecRegisters($vectors, $capData)) + { + my $liveR = ref $regMap->{$r0} ? 
$r0 : $regMap->{$r0}; + + if (exists $noDest{$op}) + { + if (my $liveTime = $liveTime{$liveR}) + { + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r0): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + } + elsif (my $liveTime = $liveTime{$liveR}) + { + if ($i > $liveTime->[$#$liveTime][1]) + { + push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + else + { + push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + foreach my $r (sort { + $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} || + $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} || + $a cmp $b + } keys %pairedBanks) + { + my $banks = $pairedBanks{$r}{banks}; + my $avail = $regMap->{$r}; + + + BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..7)) + { + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 7)) + { + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + + $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}}; + last BANK; + } + } + } + } + foreach my $r (sort keys %$regMap) + { + if (ref($regMap->{$r}) eq 'ARRAY') + { + $regMap->{$r} = 'R' . shift @{$regMap->{$r}}; + } + } + + foreach my $i (0 .. $#instructs) + { + next unless $i & 7; + + $instructs[$i]{orig} = $instructs[$i]{inst}; + $instructs[$i]{inst} =~ s/(?{$1}) ? 
$regMap->{$1} : $1 /ge; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + my $capData = parseInstruct($inst, $gram) or next; + + foreach my $r (qw(r0 r8 r20 r39)) + { + next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ'; + + my $val = substr $capData->{$r}, 1; + + my @r0 = getVecRegisters($vectors, $capData); + my @r8 = getAddrVecRegisters($vectors, $capData); + + my $regInc = $r eq 'r0' ? scalar(@r0) || 1 # r0: number of registers reported by getVecRegisters + : $r eq 'r8' ? scalar(@r8) || 1 : 1; # r8: number reported by getAddrVecRegisters; any other slot: 1 + + if ($val + $regInc > $regCnt) + { + $regCnt = $val + $regInc; + } + } + if ($op eq 'BAR') + { + if (exists $capData->{i8w4}) + { + $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt; + } + elsif (exists $capData->{r8}) + { + $barCnt = 16; + } + } + my ($code, $reuse) = genCode($op, $gram, $capData); + $instructs[$i]{code} = $code; + + if ($gram->{type}{reuse}) + { $instructs[$i]{caps} = $capData; } + else + { $ctrl->{reuse}[($i & 7) - 1] = $reuse; } + + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed); + foreach my $i (0 ..
$#instructs) + { + if ($i & 7) + { + push @codes, $instructs[$i]{code}; + my $code_dec= $instructs[$i]{code}; + my $code_hex = sprintf("0x%x", $code_dec); + + if ($instructs[$i]{caps}) + { + registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 7) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn); + } + if ($instructs[$i]{inst} =~ m'EXIT') + { + push @exitOffsets, (scalar(@codes)-1)*8; + } + elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)') + { + push @ctaidOffsets, (scalar(@codes)-1)*8; + $ctaidzUsed = 1 if $1 eq 'Z'; + } + } + else + { + my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)}; + push @codes, + ($ctrl->[0] << 2) | ($ctrl->[1] << 10) | ($ctrl->[2] << 18) | # ctrl codes + ($ctrl->[3] << 26) | ($ctrl->[4] << 34) | ($ctrl->[5] << 42) | + ($ctrl->[6] << 50) | (0x0800000000000000); # reuse codes + } + } + + return { + RegCnt => $regCnt, + BarCnt => $barCnt, + ExitOffsets => \@exitOffsets, + CTAIDOffsets => \@ctaidOffsets, + CTAIDZUsed => $ctaidzUsed, + ConflictCnt => $reuseHistory{conflicts}, + ReuseCnt => $reuseHistory{reuse}, + ReuseTot => $reuseHistory{total}, + ReusePct => ($reuseHistory{total} ? 
100 * $reuseHistory{reuse} / $reuseHistory{total} : 0), + KernelData => \@codes, + }; +} + +sub Test +{ + my ($fh, $printConflicts, $all) = @_; + + my @instructs; + my %reuseHistory; + my ($pass, $fail) = (0,0); + + while (my $line = <$fh>) + { + my (@ctrl, @reuse); + + next unless processSassCtrlLine($line, \@ctrl, \@reuse); + + foreach my $fileReuse (@reuse) + { + $line = <$fh>; + + my $inst = processSassLine($line) or next; + + $inst->{reuse} = $fileReuse; + my $fileCode = $inst->{code}; + + if (exists $relOffset{$inst->{op}}) + { + $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e; + } + + my $match = 0; + foreach my $gram (@{$grammar{$inst->{op}}}) + { + my $capData = parseInstruct($inst->{inst}, $gram) or next; + my @caps; + + my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps); + + registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse}; + + $inst->{caps} = join ', ', sort @caps; + $inst->{codeDiff} = $fileCode ^ $code; + $inst->{reuseDiff} = $fileReuse ^ $reuse; + + if ($code == $fileCode && $reuse == $fileReuse) + { + $inst->{grade} = 'PASS'; + push @instructs, $inst if $all; + $pass++; + } + else + { + $inst->{grade} = 'FAIL'; + push @instructs, $inst; + $fail++; + } + $match = 1; + last; + } + unless ($match) + { + $inst->{grade} = 'FAIL'; + $inst->{codeDiff} = $fileCode; + $inst->{reuseDiff} = $fileReuse; + push @instructs, $inst; + $fail++; + } + } + } + my %maxLen; + foreach (@instructs) + { + $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}}; + } + my ($lastOp, $template); + foreach my $inst (sort { + $a->{op} cmp $b->{op} || + $a->{codeDiff} <=> $b->{codeDiff} || + $a->{reuseDiff} <=> $b->{reuseDiff} || + $a->{ins} cmp $b->{ins} + } @instructs) + { + if ($lastOp ne $inst->{op}) + { + $lastOp = $inst->{op}; + $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s %s\n"; + printf "\n%s 
%-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures); + } + printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)}; + } + my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0; + + printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n", + $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total}; # "%%" emits a literal percent sign + + return $fail; +} + +sub Extract +{ + my ($in, $out, $params) = @_; + + my %paramMap; + my %constants = + ( + blockDimX => 'c[0x0][0x28]', + blockDimY => 'c[0x0][0x2c]', + blockDimZ => 'c[0x0][0x30]', + gridDimX => 'c[0x0][0x34]', + gridDimY => 'c[0x0][0x38]', + gridDimZ => 'c[0x0][0x3c]', + ); + print $out "\n"; + + foreach my $const (sort keys %constants) + { + print $out " $const : $constants{$const}\n"; + $paramMap{$constants{$const}} = $const; + } + print $out "\n"; + + foreach my $p (@$params) + { + my ($ord,$offset,$size,$align) = split ':', $p; + + if ($size > 4) + { + my $num = 0; + $offset = hex $offset; + while ($size > 0) + { + my $param = sprintf 'param_%d[%d]', $ord, $num; + my $const = sprintf 'c[0x0][0x%x]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + $size -= 4; + $offset += 4; + $num += 1; + } + } + else + { + my $param = sprintf 'param_%d', $ord; + my $const = sprintf 'c[0x0][%s]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + } + } + print $out "\n\n"; + + my %labels; + my $labelnum = 1; + + my @data; + FILE: while (my $line = <$in>) + { + my (@ctrl, @ruse); + next unless processSassCtrlLine($line, \@ctrl, \@ruse); + + CTRL: foreach my $ctrl (@ctrl) + { + $line = <$in>; + + my $inst = processSassLine($line) or next CTRL; + + if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)') + { + my $target = hex($1); + + last FILE if $inst->{op} eq 'BRA' && ($target == $inst->{num}|| $target
== $inst->{num}-8); + + my $label = $labels{$target}; + unless ($label) + { + $label = $labels{$target} = "TARGET$labelnum"; + $labelnum++; + } + $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/; + } + $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg; + + $inst->{ctrl} = printCtrl($ctrl); + + push @data, $inst; + } + } + foreach my $inst (@data) + { + print $out "$labels{$inst->{num}}:\n" if exists $labels{$inst->{num}}; + printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)}; + } +} + +my $CommentRe = qr'^[\t ]*.*?^\s*\n?'ms; +my $IncludeRe = qr'^[\t ]*\n?'ms; +my $CodeRe = qr'^[\t ]*(.*?)^\s*<\/CODE\1>\n?'ms; +my $ConstMapRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $RegMapRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $ScheduleRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $InlineRe = qr'\[(\+|\-)(.+?)\1\]'ms; + +sub IncludeFile +{ + my ($file, $include) = @_; + my ($vol,$dir,$name) = File::Spec->splitpath($file); + local $/; + my $fh; + if (!open $fh, $file) + { + open $fh, File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n"; + } + my $content = <$fh>; + close $fh; + return $content; +} + +sub Preprocess +{ + my ($file, $include, $debug, $regMap) = @_; + + my $constMap = {}; + my $removeRegMap; + if ($regMap) + { $removeRegMap = 1; } + else + { $regMap = {}; } + + 1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg; + + $file =~ s|$CommentRe||g; + + 1 while $file =~ s|$CodeRe| + my $out = eval "package KeplerAs::KeplerAs::CODE; $2"; + $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg; + + $file =~ s|$InlineRe| + my ($type, $code) = ($1, $2); + my $out = eval "package KeplerAs::KeplerAs::CODE; $code"; + $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg; + + $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg; + + my @newFile; + foreach my $line (split "\n", $file) + { + if ($line !~ m'^\s*(?:#|//).*') + { + $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? 
$constMap->{$1} : $1 |eg; + } + push @newFile, $line; + } + $file = join "\n", @newFile; + + $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg; + + my @schedBlocks = $file =~ /$ScheduleRe/g; + + foreach my $i (0 .. $#schedBlocks) + { + $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]); + + $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug); + } + + $file =~ s|$ScheduleRe| shift @schedBlocks |eg; + + return $file; +} + +my %srcReg = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X); +my %destReg = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC); +my %regops = (%srcReg, %destReg); +my @itypes = qw(class lat rlat tput dual); + +sub Scheduler +{ + my ($block, $blockNum, $regMap, $debug) = @_; + + my $vectors = $regMap->{__vectors}; + my $lineNum = 0; + + my (@instructs, @comments, $ordered, $first); + foreach my $line (split "\n", $block) + { + $lineNum++; + + unless (preProcessLine($line)) + { + push @comments, $line if $line =~ m'\S'; + next; + } + + if (my $inst = processAsmLine($line, $lineNum)) + { + $inst->{first} = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1; + + $inst->{exeTime} = 0; + $inst->{order} = $ordered++ if $ordered; + push @instructs, $inst; + } + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + die "SCHEDULE_BLOCK's cannot contain labels. 
block: $blockNum line: $lineNum\n"; + } + elsif ($line =~ m'^') + { + die "you cannot use nested tags" if $ordered; + $ordered = 1; + } + elsif ($line =~ m'^') + { + die "missing opening for closing tag" if !$ordered; + $ordered = 0; + } + else + { + die "badly formed line at block: $blockNum line: $lineNum: $line\n"; + } + } + + my (%writes, %reads, @ready, @schedule, $orderedParent); + foreach my $instruct (@instructs) + { + my $match = 0; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $capData = parseInstruct($instruct->{inst}, $gram) or next; + my (@dest, @src); + + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + + push @src, $instruct->{predReg} if $instruct->{pred}; + + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src; + my $mask = hex($capData->{i20w7}); + foreach my $p (0..6) + { + if ($mask & (1 << $p)) + { + push @$list, "P$p"; + } + elsif ($instruct->{op} eq 'R2P') + { + push @src, "P$p"; + } + } + } + + foreach my $operand (grep { exists $regops{$_} } sort keys %$capData) + { + my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src; + + my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT'; + + if ($capData->{$operand} ne $badVal) + { + push @$list, + $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) : + $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) : + $operand eq 'CC' ? 'CC' : + $operand eq 'X' ? 'CC' : + getRegNum($regMap, $capData->{$operand}); + } + } + $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39}); + + foreach my $src (grep { exists $writes{$_} } @src) + { + my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat}; + + foreach my $parent (@{$writes{$src}}) + { + my $latency = $src =~ m'^P\d' ? 
13 : $parent->{lat}; + push @{$parent->{children}}, [$instruct, $latency - $regLatency]; + $instruct->{parents}++; + + last unless $parent->{pred}; + } + } + + foreach my $dest (grep { exists $reads{$_} } @dest) + { + foreach my $reader (@{$reads{$dest}}) + { + push @{$reader->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + delete $reads{$dest} unless $instruct->{pred}; + } + + if ($instruct->{order}) + { + if ($orderedParent) + { + push @{$orderedParent->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + $orderedParent = $instruct; + } + elsif ($orderedParent) + { $orderedParent = 0; } + + unshift @{$writes{$_}}, $instruct foreach @dest; + + push @{$reads{$_}}, $instruct foreach @src; + + push @ready, $instruct if !exists $instruct->{parents}; + + $match = 1; + last; + } + die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match; + } + %writes = (); + %reads = (); + + if (@ready) + { + my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" }; + + countUniqueDescendants($readyParent, {}); + updateDepCounts($readyParent, {}); + + @ready = sort { + $a->{first} <=> $b->{first} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + my $clock = 0; + while (my $instruct = shift @ready) + { + my $stall = $instruct->{stall}; + + if (@schedule && $stall < 16) + { + my $prev = $schedule[$#schedule]; + + $prev->{ctrl} &= $stall > 4 ? 
0x1ffe0 : 0x1fff0; + $prev->{ctrl} |= $stall; + $clock += $stall; + } + else + { + $instruct->{ctrl} &= 0x1fff0; + $instruct->{ctrl} |= 1; + $clock += 1; + } + print "$clock: $instruct->{inst}\n" if $debug; + + push @schedule, $instruct; + + if (my $children = $instruct->{children}) + { + foreach (@$children) + { + my ($child, $latency) = @$_; + + my $earliest = $clock + $latency; + $child->{exeTime} = $earliest if $child->{exeTime} < $earliest; + + print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug; + + push @ready, $child if --$child->{parents} < 1; + } + delete $instruct->{children}; + } + + foreach my $ready (@ready) + { + $stall = $ready->{exeTime} - $clock; + $stall = 1 if $stall < 1; + + if ($ready->{class} eq $instruct->{class}) + { + $stall = $ready->{tput} if $stall < $ready->{tput}; + } + elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && + $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const})) + { + $stall = 0; + } + $ready->{stall} = $stall; + + $ready->{mix} = $ready->{class} ne $instruct->{class} || 0; + } + + @ready = sort { + $a->{first} <=> $b->{first} || + $a->{stall} <=> $b->{stall} || + $b->{mix} <=> $a->{mix} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(f exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + my $out; + $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule; + return $out; +} + +sub setConstMap +{ + my ($constMap, $constMapText) = @_; + + foreach my $line (split "\n", $constMapText) + { + $line =~ s|^\s+||; + $line =~ s{(?:#|//).*}{}; + $line =~ s|\s+$||; + next unless $line =~ m'\S'; + + my ($name, $value) = split '\s*:\s*', $line; + + $constMap->{$name} = $value; + } + return; +} + +sub setRegisterMap +{ + my ($regMap, $regmapText) = @_; + + my 
$vectors = $regMap->{__vectors} ||= {}; + my $regBank = $regMap->{__regbank} ||= {}; + my %aliases; + + foreach my $line (split "\n", $regmapText) + { + $line =~ s|^\s+||; + $line =~ s{(?:#|//).*}{}; + $line =~ s|\s+$||; + next unless $line =~ m'\S'; + + my $auto = $line =~ /~/; + my $share = $line =~ /=/; + + my ($regNums, $regNames) = split '\s*[:~=]\s*', $line; + + my (@numList, @nameList, %vecAliases); + foreach my $num (split '\s*,\s*', $regNums) + { + my ($start, $stop) = split '\s*\-\s*', $num; + die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop; + push @numList, ($start .. $stop||$start); + } + foreach my $fullName (split '\s*,\s*', $regNames) + { + if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$') + { + my ($name1, $name2, $bank) = ($1, $3, $4); + foreach (split '\s*\|\s*', $2) + { + my ($start, $stop) = split '\s*\-\s*'; + foreach my $r (map "$name1$_$name2", $start .. 
$stop||$start) + { + $aliases{$r} = "$name1$name2" unless exists $aliases{$r}; + push @nameList, $r; + $regBank->{$r} = $bank if $auto && defined $bank; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank; + } + } + } + elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$') + { + push @nameList, $1; + $regBank->{$1} = $2 if $auto && defined $2; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2; + } + else + { + die "Bad register name: '$fullName' at: $line\n"; + } + } + die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList; + die "Missmatched register mapping at: $line\n" if $share && @numList > 1; + + my $i = 0; + while ($i < $#numList-1) + { + last if $numList[$i] + 1 != $numList[$i+1]; + $i++; + } + my $ascending = $i+1 == $#numList; + + foreach my $n (0..$#nameList) + { + die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]}; + + if ($auto) + { + $regMap->{$nameList[$n]} = \@numList; + } + elsif ($share) + { + $regMap->{$nameList[$n]} = 'R' . $numList[0]; + } + else + { + $regMap->{$nameList[$n]} = 'R' . $numList[$n]; + if ($ascending && ($numList[$n] & 1) == 0) + { + my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3); + if ($end <= $#nameList) + { + $vectors->{$nameList[$n]} = [ @nameList[$n .. 
$end] ]; + if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}}) + { + $regMap->{$aliases{$nameList[$n]}} = $regMap->{$nameList[$n]}; + $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]}; + delete $aliases{$nameList[$n]}; + } + } + } + } + } + } +} + +sub preProcessLine +{ + $_[0] =~ s|^\s+||; + + my $val = shift; + + $val =~ s{(?:#|//).*}{}; + + return $val =~ m'\S'; +} + +sub countUniqueDescendants +{ + my ($node, $edges) = @_; + + + if (my $children = $node->{children}) + { + foreach my $child (grep $_->[1], @$children) # skip WaR deps and traversed edges + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + + $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges); + } + } + else + { + return $node->{lineNum}; + } + return ($node->{lineNum}, keys %{$node->{deps}}); +} +sub updateDepCounts +{ + my ($node, $edges) = @_; + + + if (my $children = $node->{children}) + { + foreach my $child (@$children) + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + updateDepCounts($child->[0], $edges); + } + } + $node->{deps} = ref $node->{deps} ? 
keys %{$node->{deps}} : $node->{deps}+0; +} + +sub registerHealth +{ + my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_; + + my (@banks, @conflicts); + + foreach my $slot (qw(r8 r20 r39)) + { + my $r = $capData->{$slot} or next; + next if $r eq 'RZ'; + + my $slotHist = $reuseHistory->{$slot} ||= {}; + + $reuseHistory->{total}++; + + if (exists $slotHist->{$r}) + { + $reuseHistory->{reuse}++; + } + else + { + my $bank = substr($r,1) & 7; + + if ($banks[$bank] && $banks[$bank] ne $r) + { + push @conflicts, $banks[$bank] if !@conflicts; + push @conflicts, $r; + + $reuseHistory->{conflicts}++; + } + $banks[$bank] = $r; + } + + if ($reuseFlags & $reuseSlots{$slot}) + { $slotHist->{$r} = 1; } + else + { delete $slotHist->{$r}; } + } + if ($inst && @conflicts && !$nowarn) + { + printf "CONFLICT at 0x%04x (%s): $inst\n", $instAddr, join(',', @conflicts); + } + return scalar @conflicts; +} + +1; + +__END__ + +=head1 NAME + +KeplerAs::KeplerAs - Assembler for NVIDIA Maxwell architecture + +=head1 SYNOPSIS + + KeplerAs.pl [opts] + +=head1 DESCRIPTION + +See the documentation at: https://github.com/NervanaSystems/KeplerAs + +=head1 SEE ALSO + +See the documentation at: https://github.com/NervanaSystems/KeplerAs + + +=head1 AUTHOR + +Scott Gray, Esgray@nervanasys.com + +=head1 COPYRIGHT AND LICENSE + +The MIT License (MIT) + +Copyright (c) 2014 Scott Gray + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +=cut diff --git a/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAsGrammar.pm b/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAsGrammar.pm new file mode 100644 index 0000000..d372ea3 --- /dev/null +++ b/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAsGrammar.pm @@ -0,0 +1,1659 @@ +package KeplerAs::KeplerAsGrammar; + +use strict; +use Carp; +use Exporter; +use Data::Dumper; +our @ISA = qw(Exporter); + +our @EXPORT = qw( + %grammar %flags + parseInstruct genCode genReuseCode + processAsmLine processSassLine processSassCtrlLine + replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters +); + +require 5.10.0; + +sub getI +{ + my ($orig, $pos, $mask) = @_; + my $val = $orig; + my $neg = $val =~ s|^\-||; + + if ($val =~ m'^(\d+)[xX]<([^>]+)>') + { + my $mul = $1; + my $exp = $2; + $exp =~ s/(?> $trunc) & 0x7ffff if $trunc; + } + return $val << $pos; +} +sub getR +{ + my ($val, $pos) = @_; + if ($val =~ m'^R(\d+|Z)$' && $1 < 255) + { + $val = $1 eq 'Z' ? 0xff : $1; + } + else + { + die "Bad register name found: $val\n"; + } + return $val << $pos; +} +sub getP +{ + my ($val, $pos) = @_; + if ($val =~ m'^P(\d|T)$' && $1 < 7) + { + $val = $1 eq 'T' ? 
7 : $1; + } + else + { + die "Bad predicate name found: $val\n"; + } + return $val << $pos; +} +sub getC { ((hex($_[0]) >> 2) & 0x3fff) << 23 } + +my %operands = +( + p0 => sub { getP($_[0], 2) }, + p3 => sub { getP($_[0], 5) }, + p12 => sub { getP($_[0], 14) }, + p29 => sub { getP($_[0], 32) }, + p39 => sub { getP($_[0], 42) }, + p45 => sub { getP($_[0], 48) }, + p48 => sub { getP($_[0], 51) }, + p58 => sub { getP($_[0], 58) }, + r0 => sub { getR($_[0], 2) }, + r8 => sub { getR($_[0], 10) }, + r20 => sub { getR($_[0], 23) }, + r28 => sub { getR($_[0], 28) }, + r39s20 => sub { getR($_[0], 42) }, + r39 => sub { getR($_[0], 42) }, + r39a => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to whipe it out, register must be in sequence with r20 + c20 => sub { getC($_[0]) }, + z20 => sub { getC($_[0]) }, + c39 => sub { getC($_[0]) }, + c34 => sub { hex($_[0]) << 37 }, + c36 => sub { hex($_[0]) << 39 }, + f20w32 => sub { getF($_[0], 23, 'f') }, + f20 => sub { getF($_[0], 23, 'f', 12) }, + d20 => sub { getF($_[0], 23, 'd', 44) }, + i8w4 => sub { getI($_[0], 10, 0xf) }, + i20 => sub { getI($_[0], 23, 0x7ffff) }, + i20w6 => sub { getI($_[0], 23, 0x3f) }, + i20w7 => sub { getI($_[0], 23, 0x7f) }, + i20w8 => sub { getI($_[0], 23, 0xff) }, + i20w12 => sub { getI($_[0], 23, 0xfff) }, + i20w24 => sub { getI($_[0], 23, 0xffffff) }, + i20w32 => sub { getI($_[0], 23, 0xffffffff) }, + i31w4 => sub { getI($_[0], 34, 0xf) }, + i34w13 => sub { getI($_[0], 37, 0x1fff) }, + i36w20 => sub { getI($_[0], 36, 0xfffff) }, + i39w8 => sub { getI($_[0], 42, 0x1f) }, + i28w8 => sub { getI($_[0], 28, 0xff) }, + i28w20 => sub { getI($_[0], 31, 0xfffff) }, + i48w8 => sub { getI($_[0], 48, 0xff) }, + i51w5 => sub { getI($_[0], 51, 0x1f) }, + i53w5 => sub { getI($_[0], 53, 0x1f) }, + i23w6 => sub { getI($_[0], 23, 0x3f) }, +); + +my $hex = qr"0[xX][0-9a-fA-F]+"; +my $iAddr = qr"\d+[xX]<[^>]+>"; +my $immed = qr"$hex|$iAddr|\d+"o; +my $reg = qr"[a-zA-Z_]\w*"; # must start with 
letter or underscore\ +my $p = qr"P[0-6T]"; +my $noPred = qr"(?)"; +my $pred = qr"\@(?\!)?P(?[0-6]) "; +my $p0 = qr"(?$p)"o; +my $p3 = qr"(?$p)"o; +my $p12 = qr"(?\!)?(?$p)"o; +my $p29 = qr"(?\!)?(?$p)"o; +my $p39 = qr"(?\!)?(?$p)"o; +my $p45 = qr"(?$p)"o; +my $p48 = qr"(?$p)"o; +my $p58 = qr"(?$p)"o; +my $r0 = qr"(?$reg)"; +my $r0cc = qr"(?$reg)(?\.CC)?"; +my $r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r28 = qr"(?$reg)"; +my $r39s20 = qr"(?\-)?(?\|)?(?(?$reg))\|?(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39 = qr"(?\-)?(?$reg)(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39a = qr"(?(?$reg))(?\.reuse)?"; +my $c20 = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20x = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20s39 = qr"(?\-)?c\[(?$hex)\]\s*\[(?$hex)\]"o; +my $f20w32 = qr"(?(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))"; +my $f20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $d20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $i8w4 = qr"(?$immed)"o; +my $i20 = qr"(?(?\-)?$immed)(?\.NEG)?"o; +my $i20w6 = qr"(?$immed)"o; +my $i20w7 = qr"(?$immed)"o; +my $i20w8 = qr"(?$immed)"o; +my $i20w12 = qr"(?$immed)"o; +my $i20w24 = qr"(?\-?$immed)"o; +my $i20w32 = qr"(?\-?$immed)"o; +my $i39w8 = qr"(?\-?$immed)"o; +my $i28w8 = qr"(?$immed)"o; +my $i28w20 = qr"(?\-?$immed)"o; +my $i31w4 = qr"(?$immed)"o; +my $i34w13 = qr"(?$immed)"o; +my $i36w20 = qr"(?$immed)"o; +my $i48w8 = qr"(?$immed)"o; +my $i51w5 = qr"(?$immed)"o; +my $i53w5 = qr"(?$immed)"o; +my $i23w6 = qr"(?$immed)"o; +my $ir20 = qr"$i20|$r20"o; +my $cr20 = qr"$c20|$r20"o; +my $icr20 = qr"$i20|$c20|$r20"o; +my $fcr20 = qr"$f20|$c20|$r20"o; +my $cr39 = qr"$c20s39|$r39"o; +my $dr20 = qr"$d20|$r20"o; + +my $u32 = qr"(?\.U32)?"; +my $REV2B = qr"(?\.REV2B)?"; +my $W = 
qr"(?\.W)?"; +my $pnot2d= qr"(?\.PNOT2D)?"; +my $ftz = qr"(?\.FTZ)?"; +my $sat = qr"(?\.SAT)?"; +my $rnd = qr"(?:\.(?RN|RM|RP|RZ))?"; +my $mulf = qr"(?:\.(?D2|D4|D8|M8|M4|M2))?"; +my $condition = qr"(?:(?F|LT|EQ|LE|GT|NE|GE|NUM|NAN|LTU|EQU|LEU|GTU|NEU|GEU|OFF|LO|SFF|LS|HI|SFT|HS|OFT))?"; +my $lane2a= qr"(?:\.(?LNONE|L0|L1|L01|L2|L02|L12|L012|L3|L03|L13|L013|L23|L023|L123))?"; +my $lane0e= qr"(?:\.(?LNONE|L0|L1|L01|L2|L02|L12|L012|L3|L03|L13|L013|L23|L023|L123))?"; + + +my $round = qr"(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?"; +my $fcmp = qr"(?\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; +my $icmp = qr"\.(?LT|EQ|LE|GT|NE|GE)"; +my $bool = qr"\.(?AND|OR|XOR|PASS_B)"; +my $bool2 = qr"\.(?AND|OR|XOR)"; +my $func = qr"\.(?COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)"; +my $rro = qr"\.(?SINCOS|EX2)"; +my $add3 = qr"(?:\.(?X|RS|LS))?"; +my $lopz = qr"(?:\.(?NZ|Z) $p48,|(?))"o; +my $X = qr"(?\.X)?"; +my $PO = qr"(?\.PO)?"; +my $bf = qr"(?\.BF)?"; +my $S = qr"(?\.S)?"; +my $tld = qr"(?NODEP\.)?(?:(?T)|(?P))"; +my $chnls = qr"(?R|RGBA)"; +my $sr = qr"SR_(?\S+)"; +my $shf = qr"(?\.W)?(?:\.(?U64|S64))?(?\.HI)?"; +my $imad = qr"(?:\.(?U32|S32))?(?:\.(?U32|S32))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $imadc = qr"(?:\.(?U32|S32))?(?:\.(?U32|S32))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $imul = qr"(?:\.(?U32|S32))?(?:\.(?U32|S32))?"; +my $vmad8 = qr"\.(?[SU])(?8|16)\.(?[SU])(?8|16)(?\.PO)?(?\.SHR_7)?(?\.SHR_15)?(?\.SAT)?"; +my $vmad16= qr"\.(?[SU])(?16)\.(?[SU])(?16)"; +my $hilo = qr"(?:\.(?XHI|XLO))?"; +my $hi = qr"(?:\.(?HI))?"; +my $vaddType = qr"(?:\.(?UD))?(?:\.(?SD))?(?:\.(?[SU])(?8|16|32))?(?:\.(?[SU])(?8|16|32))?"; +my $vaddMode = qr"(?:\.(?MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?"; +my $vmnmx = qr"(?:\.(?MX))?"; +my $x2x = qr"\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)"; +my $prmt = qr"(?:\.(?F4E|B4E|RC8|ECL|ECR|RC16))?"; +my $shfl = qr"\.(?IDX|UP|DOWN|BFLY)"; +my $bar = qr"\.(?SYNC|ARV|RED)(?:\.(?POPC|AND|OR))? 
(?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?()|(?))(?(), $p39|(?))"o; +my $b2r = qr"\.RESULT $r0(?:, $p45|(?))"o; +my $dbar = qr"(?SB0|SB1|SB2|SB3|SB4|SB5)"; +my $dbar2 = qr" {(?5)?,?(?4)?,?(?3)?,?(?2)?,?(?1)?,?(?0)?}"; +my $mbar = qr"\.(?CTA|GL|SYS)"; +my $addr = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i20w24)?\]"o; +my $addr2 = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i28w20)?\]"o; +my $ldc = qr"c\[(?$hex)\]\s*$addr"o; +my $atom = qr"(?\.E)?(?:\.(?ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; +my $vote = qr"\.(?ALL|ANY|EQ)"o; +my $memType = qr"(?\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; +my $memTypeX = qr"(?\.b32|\.b64|\.b96|\.b128)"; +my $memCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CI|CS|CV|IL|WT|LU))?"; +my $ldmemCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|LU|CV))?"; +my $stmemCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CS|WT))?"; + + + + +my $s2rT = {class => 's2r', lat => 2, blat => 25, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $smemT = {class => 'mem', lat => 2, blat => 30, rlat => 2, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $gmemT = {class => 'mem', lat => 2, blat => 200, rlat => 4, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $x32T = {class => 'x32', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 1}; +my $x64T = {class => 'x64', lat => 2, blat => 128, rlat => 0, rhold => 0, tput => 128, dual => 0, reuse => 1}; +my $shftT = {class => 'shift', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $cmpT = {class => 'cmp', lat => 13, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $qtrT = {class => 'qtr', lat => 8, blat => 0, rlat => 4, rhold => 0, tput => 1, dual => 1, reuse => 0}; +my $rroT = {class => 'rro', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $voteT = {class => 'vote', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; + + +our %grammar = +( + FADD => [ + { type 
=> $x32T, code => 0xe2c0000000000002, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc2c0000000000001, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $f20;"o, }, + ], + FADD32I => [ { type => $x32T, code => 0x4000000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o, } ], + FCHK => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o, } ], #Partial? + FCMP => [ + { type => $cmpT, code => 0xdd00000000000002, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $cr20, $r39;"o, }, + { type => $cmpT, code => 0xdd00000000000002, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $r39s20, $c20s39;"o, }, + { type => $cmpT, code => 0xb500000000000001, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $f20, $r39;"o, }, + ], + FFMA => [ + { type => $x32T, code => 0xcc00000000000002, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $cr20, $r39;"o, }, + { type => $x32T, code => 0xcc00000000000002, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0x9400000000000001, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $f20, $r39;"o, }, + ], + FMNMX => [ + { type => $shftT, code => 0xe300000000000002, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $cr20, $p39;"o, }, + { type => $shftT, code => 0xc300000000000001, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $f20, $p39;"o, }, + ], + FMUL => [ + { type => $x32T, code => 0xe340000000000002, rule => qr"^$pred?FMUL$ftz$rnd$sat$mulf $r0, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc340000000000001, rule => qr"^$pred?FMUL$ftz$rnd$sat$mulf $r0, $r8, $f20;"o, }, + ], + FMUL32I => [ { type => $x32T, code => 0x2000000000000002, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o, } ], + FSET => [ + { type => $shftT, code => 0xc000000000000002, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $cr20, $p39;"o, }, + { type => $shftT, code => 0x8000000000000001, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $f20, $p39;"o, }, + ], + FSETP => [ { type => $cmpT, 
code => 0xdd80000000000002, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], + MUFU => [ { type => $qtrT, code => 0x8400000000000002, rule => qr"^$pred?MUFU$func $r0, $r8;"o, } ], + RRO => [ { type => $rroT, code => 0xe480000000000002, rule => qr"^$pred?RRO$rro $r0, $r20;"o, } ], + DADD => [ + { type => $x64T, code => 0xe380000000000002, rule => qr"^$pred?DADD$rnd $r0, $r8, $cr20;"o, }, + { type => $x64T, code => 0xc380000000000001, rule => qr"^$pred?DADD$rnd $r0, $r8, $d20;"o, }, + ], + DFMA => [ + { type => $x64T, code => 0xdb80000000000002, rule => qr"^$pred?DFMA$rnd $r0, $r8, $cr20, $r39;"o, }, + { type => $x64T, code => 0xdb80000000000002, rule => qr"^$pred?DFMA$rnd $r0, $r8, $d20, $r39;"o, }, + ], + DMNMX => [ + { type => $cmpT, code => 0xe280000000000002, rule => qr"^$pred?DMNMX $r0, $r8, $cr20, $p39;"o, }, + { type => $cmpT, code => 0xe280000000000002, rule => qr"^$pred?DMNMX $r0, $r8, $d20, $p39;"o, }, + ], + DMUL => [ + { type => $x64T, code => 0xe400000000000002, rule => qr"^$pred?DMUL$rnd $r0, $r8, $cr20;"o, }, + { type => $x64T, code => 0xc400000000000001, rule => qr"^$pred?DMUL$rnd $r0, $r8, $d20;"o, }, + ], + DSET => [ { type => $cmpT, code => 0xc800000000000002, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o, } ], + DSETP => [ { type => $cmpT, code => 0xdc00000000000002, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o, } ], + FSWZADD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o, } ], #TODO + + HADD2 => [ { type => $x32T, code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o, } ], + HMUL2 => [ { type => $x32T, code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o, } ], + HFMA2 => [ { type => $x32T, code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o, } ], + HSETP2 => [ { type => $cmpT, code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial 
+ + BFE => [ + { type => $shftT, code => 0xe008000000000002, rule => qr"^$pred?BFE$u32$REV2B $r0, $r8, $cr20;"o, }, + { type => $shftT, code => 0xc008000000000001, rule => qr"^$pred?BFE$u32$REV2B $r0, $r8, $ir20;"o, }, + ], + BFI => [ + { type => $shftT, code => 0xdf80000000000002, rule => qr"^$pred?BFI$S $r0, $r8, $r20, $cr39;"o, }, + { type => $shftT, code => 0xb780000000000001, rule => qr"^$pred?BFI$S $r0, $r8, $i20, $cr39;"o, }, + ], + FLO => [ { type => $s2rT, code => 0xe180000000000002, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o, } ], + IADD => [ + { type => $x32T, code => 0xe080000000000002, rule => qr"^$pred?IADD$S$PO$sat$X $r0cc, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc080000000000001, rule => qr"^$pred?IADD$S$PO$sat$X $r0cc, $r8, $i20;"o, }, + ], + + ISUB => [ + { type => $x32T, code => 0xe088000000000002, rule => qr"^$pred?ISUB$sat$X $r0cc, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc088000000000001, rule => qr"^$pred?ISUB$sat$X $r0cc, $r8, $i20;"o, }, + { type => $x32T, code => 0xc090000000000001, rule => qr"^$pred?ISUB$sat$X $r0cc, $i20, $r8;"o, }, + ], + + + + IADD32I => [ { type => $x32T, code => 0x4000000000000001, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o, } ], + ICMP => [ + { type => $cmpT, code => 0xda08000000000002, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $cr20, $r39;"o, }, + { type => $cmpT, code => 0xda08000000000002, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $r39s20, $c20s39;"o, }, + { type => $cmpT, code => 0xb208000000000001, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $i20, $r39;"o, }, + ], + IMNMX => [ + { type => $shftT, code => 0xe108000000000002, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $cr20, $p39;"o, }, + { type => $shftT, code => 0xc108000000000001, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $i20, $p39;"o, }, + ], + ISET => [ + { type => $shftT, code => 0xda88000000000002, rule => qr"^$pred?ISET$bf$icmp$u32$X$bool$S $r0, $r8, $cr20, $p39;"o, }, + { type => $shftT, code => 0xb288000000000001, rule => 
qr"^$pred?ISET$bf$icmp$u32$X$bool$S $r0, $r8, $i20, $p39;"o, }, + ], + ISETP => [ + { type => $cmpT, code => 0xdb08000000000002, rule => qr"^$pred?ISETP$icmp$u32$X$bool$S $p3, $p0, $r8, $cr20, $p39;"o, }, + { type => $cmpT, code => 0xb308000000000001, rule => qr"^$pred?ISETP$icmp$u32$X$bool$S $p3, $p0, $r8, $i20, $p39;"o, }, + ], + ISCADD => [ + { type => $shftT, code => 0xe0c0000000000002, rule => qr"^$pred?ISCADD$X $r0cc, $r8, $cr20, $i39w8;"o, }, + { type => $shftT, code => 0xc0c0000000000001, rule => qr"^$pred?ISCADD$X $r0cc, $r8, $i20, $i39w8;"o, } + ], + ISCADD32I => [ { type => $shftT, code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o, } ], + + LOP => [ + { type => $x32T, code => 0xe200000000000002, rule => qr"^$pred?LOP$bool$S $r0, (?~)?$r8, (?~)?$cr20(?\.INV)?;"o, }, + { type => $x32T, code => 0xc200000000000001, rule => qr"^$pred?LOP$bool$S $r0, (?~)?$r8, (?~)?$i20(?\.INV)?;"o, }, + ], + LOP32I => [ { type => $x32T, code => 0x2000000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o, } ], + LOP3 => [ + { type => $x32T, code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o, }, + { type => $x32T, code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o, }, + ], + POPC => [ + { type => $s2rT, code => 0xe040000000000002, rule => qr"^$pred?POPC $r0, $r8, $cr20;"o, }, + { type => $s2rT, code => 0xc040000000000001, rule => qr"^$pred?POPC $r0, $r8, $i20;"o, }, + ], + SHF => [ + { type => $shftT, code => 0xdfc0000000000002, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0xb7c0000000000001, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $i20, $r39;"o, }, + { type => $shftT, code => 0xe7c0000000000002, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0xc7c0000000000001, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $i20, $r39;"o, }, + ], + SHL => [ + { type => $shftT, code => 0xe240000000000002, 
rule => qr"^$pred?SHL(?\.W)? $r0, $r8, $cr20;"o, }, + { type => $shftT, code => 0xc240000000000001, rule => qr"^$pred?SHL(?\.W)? $r0, $r8, $i23w6;"o, }, + ], + SHR => [ + { type => $shftT, code => 0xe148000000000002, rule => qr"^$pred?SHR$u32$W $r0, $r8, $cr20;"o, }, + { type => $shftT, code => 0xc148000000000001, rule => qr"^$pred?SHR$u32$W $r0, $r8, $i23w6;"o, }, + ], +IMAD => [ + { type => $x32T, code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $r20, $r39;"o, }, + { type => $x32T, code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $c20x, $r39;"o, }, + { type => $x32T, code => 0xa108000000000001, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $i20, $r39;"o, }, + ], + IMADSP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO + IMUL => [ + { type => $x32T, code => 0xe1c0180000000002, rule => qr"^$pred?IMUL$imul$hi $r0, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc1c0180000000001, rule => qr"^$pred?IMUL$imul$hi $r0, $r8, $i20;"o, }, + ], + IMUL32I => [ + { type => $x32T, code => 0x2e00000000000002, rule => qr"^$pred?IMUL32I$imul$hi $r0, $r8, $i20w32;"o, }, + ], + + F2F => [ { type => $qtrT, code => 0xe540000000000002, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ], + F2I => [ { type => $qtrT, code => 0xe580000000000002, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o, } ], + I2F => [ { type => $qtrT, code => 0xe5c0000000000002, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o, } ], + I2I => [ { type => $qtrT, code => 0xe600000000000002, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o, } ], + F2ITRUNC => [ { type => $qtrT, code => 0xe5800c00051ca846, rule => qr"^$pred?F2ITRUNC[^;]*;"o, } ], + + MOV => [ { type => $x32T, code => 0xe4c03c0000000002, rule => qr"^$pred?MOV$lane2a$S $r0, $cr20;"o, } ], + MOV32I => [ { type => $x32T, code => 
0x740000000003c002, rule => qr"^$pred?MOV32I$lane0e$S $r0, (?:$i20w32|$f20w32);"o, } ], + PRMT => [ + { type => $x32T, code => 0xde00000000000002, rule => qr"^$pred?PRMT$prmt $r0, $r8, $cr20, $cr39;"o, }, + { type => $x32T, code => 0xb600000000000001, rule => qr"^$pred?PRMT$prmt $r0, $r8, $i20, $r39;"o, }, + ], + SEL => [ + { type => $x32T, code => 0xe500000000000002, rule => qr"^$pred?SEL $r0, $r8, $cr20, $p39;"o, }, + { type => $x32T, code => 0xc500000000000001, rule => qr"^$pred?SEL $r0, $r8, $i20, $p39;"o, }, + ], + SHFL => [ { type => $smemT, code => 0x7880000000000002, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ], + + PSET => [ { type => $cmpT, code => 0x8440000000000002, rule => qr"^$pred?PSET$bf$bool2$bool $r0, $p12, $p29, $p39;"o, } ], + PSETP => [ { type => $cmpT, code => 0x8480000000000002, rule => qr"^$pred?PSETP$bool2$bool$S $p3, $p0, $p12, $p29, $p39;"o, } ], + CSET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o, } ], #TODO + CSETP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO + P2R => [ { type => $x32T, code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o, } ], + R2P => [ { type => $cmpT, code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o, } ], + + TLD => [ { type => $gmemT, code => 0x700a00067f9ffc02, rule => qr"^$pred?TLD[^;]*;"o, } ], #Partial + TLDzxx => [ { type => $gmemT, code => 0x700a00057f9ffc02, rule => qr"^$pred?TLDzxx[^;]*;"o, } ], #Partial + TEXDEPBAR => [ { type => $gmemT, code => 0x77000000001c0002, rule => qr"^$pred?TEXDEPBAR $i20w6;"o, } ], #Partial + TEX => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o, } ], #TODO + TLD4 => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o, } ], #TODO + TXQ => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o, } ], #TODO + + LD => [ { type => $gmemT, 
code => 0xc000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr;"o, } ], + LDY => [ { type => $gmemT, code => 0x7f80000000000002, rule => qr"^$pred?LDY $r0, $i20;"o, } ], + LDX => [ { type => $gmemT, code => 0x7ec0000000000002, rule => qr"^$pred?LDX$memTypeX $r0, $addr;"o, } ], + ST => [ { type => $gmemT, code => 0xe000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0;"o, } ], + LDG => [ + { type => $gmemT, code => 0x600010047f800001, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o, }, + ], + LDS => [ { type => $smemT, code => 0x7a40000000000002, rule => qr"^$pred?LDS$memCache$memType$S $r0, $addr;"o, } ], + STS => [ { type => $smemT, code => 0x7ac0000000000002, rule => qr"^$pred?STS$memCache$memType$S $addr, $r0;"o, } ], + LDL => [ { type => $gmemT, code => 0x7a00000000000002, rule => qr"^$pred?LDL$ldmemCache$memType$S $r0, $addr;"o, } ], + STL => [ { type => $gmemT, code => 0x7a80000000000002, rule => qr"^$pred?STL$stmemCache$memType$S $addr, $r0;"o, } ], + LDC => [ { type => $gmemT, code => 0x7c800000000ffc02, rule => qr"^$pred?LDC$memCache$memType$S $r0, $ldc;"o, } ], + ATOM => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + RED => [ { type => $gmemT, code => 0x68000000000003fe, rule => qr"^$pred?RED$atom $addr2, $r20;"o, } ], + CCTL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o, } ], #TODO + CCTLL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO + + SULD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o, } ], #TODO + + BRA => [ + { type => $x32T, code => 0x120000000000003c, rule => qr"^$pred?BRA(?\.U)? $i20w24;"o, }, + { type => $x32T, code => 0x1200000000000000, rule => qr"^$pred?BRA(?\.U)? 
CC\.$condition, $i20w24;"o, }, + ], + + BRX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o, } ], #TODO + JMP => [ + { type => $x32T, code => 0x108000000000003c, rule => qr"^$pred?JMP(?\.U)? $i20w32;"o, }, + { type => $x32T, code => 0x1080000000000000, rule => qr"^$pred?JMP(?\.U)? CC\.$condition, $i20w32;"o, }, + ], + JMX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o, } ], #TODO + SSY => [ { type => $x32T, code => 0x1480000000000000, rule => qr"^$noPred?SSY $i20w24;"o, } ], + + CAL => [ { type => $x32T, code => 0x1300000000000100, rule => qr"^$noPred?CAL $i20w24;"o, } ], + JCAL => [ { type => $x32T, code => 0x1100000000000100, rule => qr"^$noPred?JCAL $i20w32;"o, } ], + PRET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o, } ], #TODO + RET => [ + { type => $x32T, code => 0x190000000000003c, rule => qr"^$pred?RET;"o, }, + { type => $x32T, code => 0x1900000000000000, rule => qr"^$pred?RET CC\.$condition;"o, }, + ], + BRK => [ { type => $x32T, code => 0x1a0000000000003c, rule => qr"^$pred?BRK;"o, } ], + PBK => [ { type => $x32T, code => 0x1500000000000000, rule => qr"^$noPred?PBK $i20w24;"o, } ], + CONT => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o, } ], + PCNT => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o, } ], + EXIT => [ + { type => $x32T, code => 0x18000000001c003c, rule => qr"^$pred?EXIT;"o, }, + { type => $x32T, code => 0x18000000001c0000, rule => qr"^$pred?EXIT CC\.$condition;"o, }, + ], + BPT => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o, } ], + + NOP => [ { type => $x32T, code => 0x8580000000003c02, rule => qr"^$pred?NOP$S;"o, } ], + S2R => [ { type => $s2rT, code => 0x8640000000000002, rule => qr"^$pred?S2R$S $r0, $sr;"o, } ], + B2R => [ { type => $x32T, code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o, } ], + BAR => [ + { type => $gmemT, 
code => 0x8540dc0000000002, rule => qr"^$pred?BAR.SYNC $i8w4;"o, }, + { type => $gmemT, code => 0x8540dc0000000002, rule => qr"^$pred?BAR.SYNC $i8w4, $i20w12;"o, }, + { type => $gmemT, code => 0x85409c0000000002, rule => qr"^$pred?BAR.SYNC $i8w4, $r20;"o, }, + { type => $gmemT, code => 0x85405c0000000002, rule => qr"^$pred?BAR.SYNC $r8;"o, }, + { type => $gmemT, code => 0x85405c0000000002, rule => qr"^$pred?BAR.SYNC $r8, $i20w12;"o, }, + { type => $gmemT, code => 0x85401c0000000002, rule => qr"^$pred?BAR.SYNC $r8, $r20;"o, }, + { type => $gmemT, code => 0x8540dc0800000002, rule => qr"^$pred?BAR.ARV $i8w4, $i20w12;"o, }, + { type => $gmemT, code => 0x85409c0800000002, rule => qr"^$pred?BAR.ARV $i8w4, $r20;"o, }, + { type => $gmemT, code => 0x85405c0800000002, rule => qr"^$pred?BAR.ARV $r8, $i20w12;"o, }, + { type => $gmemT, code => 0x85401c0800000002, rule => qr"^$pred?BAR.ARV $r8, $r20;"o, }, + ], + DEPBAR => [ + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, }, + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o, }, + ], + MEMBAR => [ { type => $x32T, code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o, } ], + + VOTE => [ + { type => $voteT, code => 0x86c0000000000002, rule => qr"^$pred?VOTE$vote (?:$r0, |(?))$p45, $p39;"o, } ], + + + VADD => [ { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMAD => [ + { type => $x32T, code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $i20, $r39;"o, }, + { type => $x32T, code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, }, + ], + VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 
0x2044000000000000 + VMNMX => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + + VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 +); + +my @flags = grep /\S/, split "\n", q{; + +BFE, BFI, FLO, IADD, ISUB, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD +0x0800000000000000 neg + +FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP +0x0800000000000000 neg + +PSET, PSETP +0x0000000000020000 p12not +0x0000000800000000 p29not + +FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE +0x0000200000000000 p39not + +IADD32I +0x0010000000000000 CC + +IMAD, PSET, FSET, DSET, ISET, IADD, ISUB, IMUL, ISCADD +0x0004000000000000 CC + +IMAD: mode +0x0200000000000000 HI + +IMAD +0x0010000000000000 X + +IMUL: mode +0x0000040000000000 HI + +IMUL32I: mode +0x0100000000000000 HI + +FFMA, FADD, FCMP, FMUL, FMNMX, FSWZ, FSET, FSETP, FCHK, RRO, MUFU, DFMA, DADD, DMUL, DMNMX, DSET, DSETP, IMAD, IMADSP, IMUL, IADD, ISCADD, ISAD, IMNMX, BFE, BFI, SHR, SHL, SHF, LOP, FLO, ISET, ISETP, ICMP, POPC, F2F, F2I, I2F, I2I, MOV, MOV32I, SEL, PRMT, SHFL, P2R, R2P, CSET, CSETP, PSET, PSETP, TEX, TLD, TLD4, TXQ, LDC, LD, LDG, LDL, LDS, LDSLK, ST, STL, STS, STSCUL, ATOM, RED, CCTL, CCTLL, MEMBAR, SUCLAMP, SUBFM, SUEAU, SULDGA, SUSTGA, BRA, BRX, RET, BRK, CONT, NOP, S2R, B2R, BAR, VOTE, MOV +0x0000000000400000 S + +SHF +0x0020000000000000 W +0x0001000000000000 HI + +SHF: type +0x0000020000000000 U64 +0x0000010000000000 S64 + +IMAD, ICMP, ISET, ISETP, ISAD, SHR, IMNMX, FLO, BFE +0x0008000000000000 U32 + +SHR, SHL +0x0000040000000000 W + +SHFL +0x0000000080000000 i20w8 +0x0000000100000000 i34w13 + +SHFL: mode +0x0000000000000000 IDX +0x0000000200000000 UP +0x0000000300000000 DOWN 
+0x0000000600000000 BFLY + +IMNMX: mode +0x0000080000000000 XLO +0x0000180000000000 XHI + +ISETP, ISET, ICMP: cmp +0x0010000000000000 LT +0x0020000000000000 EQ +0x0030000000000000 LE +0x0040000000000000 GT +0x0050000000000000 NE +0x0060000000000000 GE + +ISETP, ISET, PSETP, PSET, FSET, FSETP, DSET, DSETP: bool +0x0000000000000000 AND +0x0001000000000000 OR +0x0002000000000000 XOR + +PSETP, PSET: bool2 +0x0000000000000000 AND +0x0000000008000000 OR +0x0000000010000000 XOR + +ISETP, ISET, IADD, ISUB +0x0000400000000000 X + +ISCADD +0x0020000000000000 X + +ISET, PSET +0x0000800000000000 BF + +LOP: bool +0x0000000000000000 AND +0x0000100000000000 OR +0x0000200000000000 XOR +0x0000300000000000 PASS_B + +LOP, POPC, FLO +0x0000080000000000 INV + +LOP, POPC, IADD, ISUB +0x0000040000000000 INV1 + +LOP: z +0x0000200000000000 Z +0x0000300000000000 NZ + +LOP +0x0000000000000000 noz + +LOP32I: bool +0x0000000000000000 AND +0x0020000000000000 OR +0x0040000000000000 XOR + +PRMT: mode +0x0008000000000000 F4E +0x0010000000000000 B4E +0x0018000000000000 RC8 +0x0020000000000000 ECL +0x0028000000000000 ECR +0x0030000000000000 RC16 + +IMAD: type1 +0x0008000000000000 U32 +0x0008000000000000 S32 + +IMAD: type2 +0x0100000000000000 U32 +0x0100000000000000 S32 + +IMUL: type1 +0x0000080000000000 U32 +0x0000000000000000 S32 + +IMUL: type2 +0x0000100000000000 U32 +0x0000000000000000 S32 + +IMUL32I: type1 +0x0200000000000000 U32 +0x0000000000000000 S32 + +IMUL32I: type2 +0x0400000000000000 U32 +0x0000000000000000 S32 + +XMAD: type1 +0x0000000000000000 U16 +0x0001000000000000 S16 + +XMAD: type2 +0x0000000000000000 U16 +0x0002000000000000 S16 + +XMAD: mode +0x0000002000000000 MRG +0x0000001000000000 PSL +0x0008000000000000 CHI +0x0004000000000000 CLO +0x000c000000000000 CSFU + +XMAD: modec +0x0004000000000000 CLO +0x0008000000000000 CHI +0x000c000000000000 CSFU +0x0040000000000000 X +0x0080000000000000 PSL +0x0100000000000000 MRG + +XMAD +0x0010000000000000 CBCC + +XMAD: r8part 
+0x0000000000000000 H0 +0x0020000000000000 H1 + +XMAD: r20part +0x0000000000000000 H0 +0x0000000800000000 H1 + +XMAD: r20partx +0x0000000000000000 H0 +0x0010000000000000 H1 + +XMAD: r39part +0x0000000000000000 H0 +0x0010000000000000 H1 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part +0x0000000000000000 B0 +0x0000001000000000 B1 +0x0000002000000000 B2 +0x0000003000000000 B3 +0x0000001000000000 H1 +0x0000000000000000 H0 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part +0x0000000000000000 B0 +0x0000000010000000 B1 +0x0000000020000000 B2 +0x0000000030000000 B3 +0x0000000010000000 H1 +0x0000000000000000 H0 + +VMAD +0x0040000000000000 r8neg +0x0020000000000000 r39neg +0x0008000000000000 SHR_7 +0x0010000000000000 SHR_15 +0x0060000000000000 PO +0x0080000000000000 SAT + +VMNMX +0x0100000000000000 MX + +VADD, VABSDIFF, VMNMX +0x0080000000000000 SAT +0x0040000000000000 UD +0x0040000000000000 SD + +VSET: cmp +0x0040000000000000 LT +0x0080000000000000 EQ +0x00c0000000000000 LE +0x0100000000000000 GT +0x0140000000000000 NE +0x0180000000000000 GE + +VADD, VSET: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VABSDIFF: mode +0x0003000000000000 ACC +0x000b000000000000 MIN +0x0013000000000000 MAX +0x0023000000000000 MRG_16H +0x002b000000000000 MRG_16L +0x0033000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x003b000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMNMX: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1 +0x0000000000000000 U +0x0004000000000000 S + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2 +0x0000000000000000 U +0x0008000000000000 S + +VMAD, 
VADD, VABSDIFF, VMNMX, VSET: size1 +0x0000000000000000 8 +0x0000004000000000 16 +0x0000006000000000 32 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size2 +0x0000000000000000 8 +0x0000000000000000 16 +0x0000000000000000 32 + +IADD3: type +0x0001000000000000 X +0x0000002000000000 RS +0x0000004000000000 LS + +IADD3: r8part +0x0000000000000000 H0 +0x0000001000000000 H1 + +IADD3: r20part +0x0000000080000000 H0 + +IADD3: r39part +0x0000000200000000 H0 + +IADD3 +0x0008000000000000 r8neg +0x0004000000000000 r20neg +0x0002000000000000 r39neg + +IADD, ISUB, ISCADD +0x0010000000000000 r8neg +0x0008000000000000 r20neg +0x0018000000000000 PO + +IADD32I +0x0100000000000000 X +0x0800000000000000 r8neg + +IMAD +0x0080000000000000 r8neg + +IMAD +0x0040000000000000 r39neg + +DEPBAR: SB +0x0000000000000000 SB0 +0x0000000004000000 SB1 +0x0000000008000000 SB2 +0x000000000c000000 SB3 +0x0000000010000000 SB4 +0x0000000014000000 SB5 + +DEPBAR: cmp +0x0000000020000000 LE + +DEPBAR +0x0000000000000001 db0 +0x0000000000000002 db1 +0x0000000000000004 db2 +0x0000000000000008 db3 +0x0000000000000010 db4 +0x0000000000000020 db5 + +F2F, F2I, I2F, I2I: destWidth +0x0000000000000000 8 +0x0000000000000400 16 +0x0000000000000800 32 +0x0000000000000c00 64 + +F2F, F2I, I2F, I2I: srcWidth +0x0000000000000000 8 +0x0000000000001000 16 +0x0000000000002000 32 +0x0000000000003000 64 + +F2F, F2I, I2F, I2I: destSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000008000 S + +F2F, F2I, I2F, I2I: srcSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000008000 S + +F2I, I2F, I2I: r20part +0x0000000000000000 H0 +0x0000040000000000 H1 +0x0000000000000000 B0 +0x0000020000000000 B1 +0x0000040000000000 B2 +0x0000060000000000 B3 + +F2F: r20part +0x0000000000000000 H0 +0x0000020000000000 H1 + +F2F: round +0x0000040000000000 ROUND +0x0000048000000000 FLOOR +0x0000050000000000 CEIL +0x0000058000000000 TRUNC + +F2I: round +0x0000000000000000 ROUND +0x0000040000000000 FLOOR +0x0000080000000000 CEIL 
+0x00000c0000000000 TRUNC + +HADD2, HMUL2: r8part +0x0001000000000000 H0_H0 +0x0000000000000000 H1_H1 + +HFMA2: r20part +0x0000000020000000 H0_H0 +0x0000000030000000 H1_H1 + +FADD, DADD, FMUL, DMUL, F2F, I2F: rnd +0x0000000000000000 RN +0x0000040000000000 RM +0x0000080000000000 RP +0x00000c0000000000 RZ + +FMUL: mulf +0x0000100000000000 D2 +0x0000200000000000 D4 +0x0000300000000000 D8 +0x0000400000000000 M8 +0x0000500000000000 M4 +0x0000600000000000 M2 + +BRA, JMP, RET, EXIT: CON +0x0000000000000000 F +0x0000000000000004 LT +0x0000000000000008 EQ +0x000000000000000c LE +0x0000000000000010 GT +0x0000000000000014 NE +0x0000000000000018 GE +0x000000000000001c NUM +0x0000000000000020 NAN +0x0000000000000024 LTU +0x0000000000000028 EQU +0x000000000000002c LEU +0x0000000000000030 GTU +0x0000000000000034 NEU +0x0000000000000038 GEU +0x0000000000000040 OFF +0x0000000000000044 LO +0x0000000000000048 SFF +0x000000000000004c LS +0x0000000000000050 HI +0x0000000000000054 SFT +0x0000000000000058 HS +0x000000000000005c OFT + +MOV: lane2a +0x0000380000000000 LNONE +0x0000340000000000 L0 +0x0000300000000000 L1 +0x00002c0000000000 L01 +0x0000280000000000 L2 +0x0000240000000000 L02 +0x0000200000000000 L12 +0x00001c0000000000 L3 +0x0000180000000000 L03 +0x0000140000000000 L13 +0x0000100000000000 L013 +0x00000c0000000000 L23 +0x0000080000000000 L023 +0x0000040000000000 L123 + +MOV32I: lane0e +0x0000000000038000 LNONE +0x0000000000034000 L0 +0x0000000000030000 L1 +0x000000000002c000 L01 +0x0000000000028000 L2 +0x0000000000024000 L02 +0x0000000000020000 L12 +0x000000000001c000 L3 +0x0000000000018000 L03 +0x0000000000014000 L13 +0x0000000000010000 L013 +0x000000000000c000 L23 +0x0000000000008000 L023 +0x0000000000004000 L123 + +DFMA: rnd +0x0000000000000000 RN +0x0004000000000000 RM +0x0008000000000000 RP +0x000c000000000000 RZ + +FFMA: rnd +0x0000000000000000 RN +0x0040000000000000 RM +0x0080000000000000 RP +0x00c0000000000000 RZ + +FFMA, FMUL32I +0x0100000000000000 FTZ + +F2F, F2I, 
FADD, FMUL, FMNMX +0x0000800000000000 FTZ + +FADD32I +0x0080000000000000 FTZ + +FMUL32I +0x0020000000000000 FTZ + +FSET, FSETP, FCMP, DSET, DSETP +0x0400000000000000 FTZ + +HADD2, HMUL2 +0x0000008000000000 FTZ + +HFMA2 +0x0000002000000000 FTZ + +FADD, FFMA, FMUL, F2F, I2I, MUFU, IMAD, IADD, ISUB +0x0020000000000000 SAT + +FADD, DADD, FMNMX, DMNMX, MUFU, FFMA, DFMA, FMUL, DADD, DMUL +0x0008000000000000 r8neg + +FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I +0x0001000000000000 r20neg + +FMUL, DMUL, FFMA, DFMA +0x0001000000000000 r20neg + +FFMA, DFMA +0x0010000000000000 r39neg + +FADD, DADD, FMNMX, DMNMX, MUFU +0x0002000000000000 r8abs + +FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I +0x0010000000000000 r20abs + +FSETP, DSETP, FSET, DSET +0x0000400000000000 r8neg +0x0100000000000000 r20neg +0x0200000000000000 r8abs +0x0000800000000000 r20abs + +RRO: func +0x0000000000000000 SINCOS +0x0000040000000000 EX2 + +MUFU: func +0x0000000000000000 COS +0x0000000000800000 SIN +0x0000000001000000 EX2 +0x0000000001800000 LG2 +0x0000000002000000 RCP +0x0000000002800000 RSQ +0x0000000003000000 RCP64H +0x0000000003800000 RSQ64H + +FSETP, DSETP, FSET, DSET, FCMP: cmp +0x0008000000000000 .LT +0x0010000000000000 .EQ +0x0018000000000000 .LE +0x0020000000000000 .GT +0x0020000000000000 +0x0028000000000000 .NE +0x0030000000000000 .GE +0x0038000000000000 .NUM +0x0040000000000000 .NAN +0x0048000000000000 .LTU +0x0050000000000000 .EQU +0x0058000000000000 .LEU +0x0060000000000000 .GTU +0x0068000000000000 .NEU +0x0070000000000000 .GEU + +FSETP, DSETP, FSET, DSET: bool +0x0000000000000000 AND +0x0001000000000000 OR +0x0002000000000000 XOR + +HSETP2: cmp +0x0000002800000000 .NE + +HSETP2: bool +0x0000000000000000 AND + +S2R: sr +0x0000000000000000 LANEID +0x0000000001000000 VIRTCFG +0x0000000001800000 VIRTID +0x0000000002000000 PM0 +0x0000000002800000 PM1 +0x0000000003000000 PM2 +0x0000000003800000 PM3 +0x0000000004000000 PM4 +0x0000000004800000 PM5 +0x0000000005000000 PM6 +0x0000000005800000 
PM7 +0x0000000008000000 PRIM_TYPE +0x0000000008800000 INVOCATION_ID +0x0000000009000000 Y_DIRECTION +0x0000000010000000 TID +0x0000000010800000 TID.X +0x0000000011000000 TID.Y +0x0000000011800000 TID.Z +0x0000000012000000 CTA_PARAM +0x0000000012800000 CTAID.X +0x0000000013000000 CTAID.Y +0x0000000013800000 CTAID.Z +0x0000000014000000 NTID +0x0000000014800000 CirQueueIncrMinusOne +0x0000000015000000 NLATC +0x0000000015800000 43 +0x0000000016000000 44 +0x0000000016800000 45 +0x0000000017000000 46 +0x0000000017800000 47 +0x0000000018000000 SWINLO +0x0000000018800000 SWINSZ +0x0000000019000000 SMEMSZ +0x0000000019800000 SMEMBANKS +0x000000001a000000 LWINLO +0x000000001a800000 LWINSZ +0x000000001b000000 LMEMLOSZ +0x000000001b800000 LMEMHIOFF +0x000000001c000000 EQMASK +0x000000001c800000 LTMASK +0x000000001d000000 LEMASK +0x000000001d800000 GTMASK +0x000000001e000000 GEMASK +0x0000000020000000 GLOBALERRORSTATUS +0x0000000021000000 WARPERRORSTATUS +0x0000000028000000 CLOCKLO +0x0000000029000000 GLOBALTIMERLO +0x0000000029800000 GLOBALTIMERHI + +CS2R: sr +0x0000000005000000 CLOCKLO +0x0000000005100000 CLOCKHI +0x0000000005200000 GLOBALTIMERLO +0x0000000005300000 GLOBALTIMERHI + +B2R +0x0000e00000000000 nop45 + +BAR: red +0x0000000000000000 POPC +0x0000000800000000 AND +0x0000001000000000 OR + +MEMBAR: mode +0x0000000000000000 CTA +0x0000000000000100 GL +0x0000000000000200 SYS + +VOTE: mode +0x0000000000000000 ALL +0x0008000000000000 ANY +0x0010000000000000 EQ + +VOTE +0x00000000000003fc nor0 + +BRA +0x0000000000000200 U + +TLDS: chnls +0x0010000000000000 RGBA + +TLDS +0x0002000000000000 NODEP + +LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS +0x0000000000000000 nor8 + +LD, ST: type +0x0000000000000000 .U8 +0x0100000000000000 .S8 +0x0200000000000000 .U16 +0x0300000000000000 .S16 +0x0400000000000000 +0x0400000000000000 .32 +0x0500000000000000 .64 +0x0600000000000000 .128 + +LDX: type +0x0000000000000000 .b32 +0x0004000000000000 .b64 +0x0008000000000000 .b96 
+0x000c000000000000 .b128 + +LD, ST: cache +0x0000000000000000 CG +0x1000000000000000 CS +0x1800000000000000 CV +0x1800000000000000 WT + +STG, LDS, STS, LDL, STL, LDC: type +0x0000000000000000 .U8 +0x0008000000000000 .S8 +0x0010000000000000 .U16 +0x0018000000000000 .S16 +0x0020000000000000 +0x0020000000000000 .32 +0x0028000000000000 .64 +0x0030000000000000 .128 + +LDG: type +0x0000000000000000 .U8 +0x0000800000000000 .S8 +0x0001000000000000 .U16 +0x0001800000000000 .S16 +0x0002000000000000 +0x0002000000000000 .32 +0x0002800800000000 .64 +0x0003003800000000 .128 + +LDG, STG: cache +0x0000000000000000 CG +0x0000000000000000 CI +0x0000040000000000 CS +0x0000000000000000 CV +0x0000000000000000 WT + +LDG +0x0000008000000000 E + +LDL: cache +0x0000200000000000 CI + +LDL, STL: cache +0x0000800000000000 CG +0x0001000000000000 LU +0x0001800000000000 CV +0x0001800000000000 WT + +LDC: cache +0x0000100000000000 IL + +STG, LDS, STS, LDL, STL, LDC +0x0000200000000000 E + +LDS +0x0008000000000000 U + +RED: type +0x0000000000000000 +0x0010000000000000 .S32 +0x0020000000000000 .U64 +0x0030000000000000 .F32.FTZ.RN +0x0040000000000000 .F16x2.FTZ.RN +0x0050000000000000 .S64 + +RED: mode +0x0000000000000000 ADD +0x0080000000000000 MIN +0x0100000000000000 MAX +0x0180000000000000 INC +0x0200000000000000 DEC +0x0280000000000000 AND +0x0300000000000000 OR +0x0380000000000000 XOR + +ATOM: type +0x0000000000000000 +0x0002000000000000 .S32 +0x0004000000000000 .U64 +0x0006000000000000 .F32.FTZ.RN +0x0008000000000000 .F16x2.FTZ.RN +0x000a000000000000 .S64 +0x0002000000000000 .64 + +ATOM, RED +0x0008000000000000 E + +LD, ST +0x0080000000000000 E + +ATOM: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x03f0000000000000 CAS + +ATOMS: type +0x0000000000000000 +0x0000000010000000 .S32 +0x0000000020000000 .U64 
+0x0000000030000000 .S64 +0x0010000000000000 .64 + +ATOMS: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x0240000000000000 CAS + +BFE:REV2B +0x0000080000000000 REV2B +}; + +our %flags; +my (@ops, $flag); +foreach my $line (@flags) +{ + if ($line =~ m'^(0x[0-9a-z]+)\s*(.*)') + { + my $val = hex($1); + if ($flag) + { $flags{$_}{$flag}{$2} = $val foreach @ops; } + else + { $flags{$_}{$2} = $val foreach @ops; } + } + else + { + my ($ops, $name) = split ':\s*', $line; + @ops = split ',\s*', $ops; + $flag = $name; + } +} + +sub parseInstruct +{ + my ($inst, $grammar) = @_; + return unless $inst =~ $grammar->{rule}; + my %capData = %+; + return \%capData; +} + +my %immedOps = map { $_ => 1 } qw(i20 f20 d20); +my %immedCodes = +( + 0x5c => 0x64, + 0x5b => 0x6d, + 0x59 => 0x6b, + 0x58 => 0x68, +); +my %constCodes = +( + c20 => 0x2, + c39 => 0x1, +); +my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4); + +sub genReuseCode +{ + my $capData = shift; + my $reuse = 0; + $reuse |= $reuseCodes{$_} foreach grep $capData->{$_}, keys %reuseCodes; + return $reuse; +} + +sub genCode +{ + my ($op, $grammar, $capData, $test) = @_; + + my $flags = $flags{$op}; + my $code = $grammar->{code}; + my $reuse = 0; + + + if (exists $capData->{noPred}) + { + delete $capData->{noPred}; + push @$test, 'noPred' if $test; + } + else + { + my $p = defined($capData->{predNum}) ? 
$capData->{predNum} : 7; + push @$test, 'predNum' if $test; + if (exists $capData->{predNot}) + { + $p |= 8; + push @$test, 'predNot' if $test; + } + $code |= $p << 18; + delete @{$capData}{qw(predNum predNot)}; + + } + foreach my $rcode (qw(reuse1 reuse2 reuse3)) + { + if (delete $capData->{$rcode}) + { + $reuse |= $reuseCodes{$rcode}; + push @$test, $rcode if $test; + } + } + + foreach my $capture (keys %$capData) + { + if (exists $constCodes{$capture}) + { $code ^= $constCodes{$capture} << 62; } + + if (exists $operands{$capture}) + { + unless ($capture eq 'r20' && exists $capData->{r39s20}) + { + $code ^= $operands{$capture}->($capData->{$capture}); + push @$test, $capture if $test; + } + } + + if (exists $flags->{$capture}) + { + if (ref $flags->{$capture}) + { + $code ^= $flags->{$capture}{$capData->{$capture}}; + push @$test, "$capture:$capData->{$capture}" if $test; + } + else + { + $code ^= $flags->{$capture}; + push @$test, $capture if $test; + } + } + elsif (!exists $operands{$capture} && !$test) + { + warn "UNUSED: $op: $capture: $capData->{$capture}\n"; + warn Dumper($flags); + } + } + + return $code, $reuse; +} + + +my $CtrlRe = qr'(?[T\-]:[G\-]:[D\-]:[S\-]:[0-9]{2})'; +my $PredRe = qr'(?@!?(?P\d)\s+)'; +my $InstRe = qr"$PredRe?(?\w+)(?[^;]*;)"o; +my $CommRe = qr'(?.*)'; + +sub processAsmLine +{ + my ($line, $lineNum) = @_; + + if ($line =~ m"^$CtrlRe(?\s+)$InstRe$CommRe"o) + { + return { + lineNum => $lineNum, + pred => $+{pred}, + predReg => $+{predReg}, + space => $+{space}, + op => $+{op}, + comment => $+{comment}, + inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}), + ctrl => readCtrl($+{ctrl}, $line), + }; + } + return undef; +} + +sub processSassLine +{ + my $line = shift; + + if ($line =~ m"^\s+/\*(?[0-9a-f]+)\*/\s+$InstRe\s+/\* (?0x[0-9a-f]+)"o) + { + return { + num => hex($+{num}), + pred => $+{pred}, + op => $+{op}, + ins => normalizeSpacing($+{op} . $+{rest}), + inst => normalizeSpacing($+{pred} . $+{op} . 
# Decode a control-word line of the form "<spaces>/* 0x<hex>".
#   $line - the line to examine
#   $ctrl - if an array ref, receives seven 8-bit fields taken from
#           bits 2, 10, 18, 26, 34, 42 and 50 of the word
#   $ruse - if an array ref, receives seven 4-bit fields taken from
#           bits 17, 38, 59, 17, 38, 59, 59 (in that order)
# Returns 1 when the line carried a control word, 0 otherwise (in which case
# neither array is touched).
sub processSassCtrlLine
{
    my ($line, $ctrl, $ruse) = @_;

    my ($hexWord) = $line =~ m'^\s+\/\* (0x[0-9a-f]+)'
        or return 0;

    my $word = hex($hexWord);

    if (ref $ctrl)
    {
        foreach my $shift (2, 10, 18, 26, 34, 42, 50)
        {
            push @$ctrl, ($word >> $shift) & 0xff;
        }
    }
    if (ref $ruse)
    {
        # NOTE(review): the repeated shift positions mirror the original
        # code exactly; whether the repetition is intended should be
        # confirmed against the callers.
        foreach my $shift (17, 38, 59, 17, 38, 59, 59)
        {
            push @$ruse, ($word >> $shift) & 0xf;
        }
    }
    return 1;
}
# Canonicalise whitespace in an instruction string: every tab becomes a
# space, then every run of two or more whitespace characters collapses to a
# single space. Returns the normalised copy.
sub normalizeSpacing
{
    my $inst = shift;
    $inst =~ tr/\t/ /;
    $inst =~ s/\s{2,}/ /g;
    return $inst;
}


# Render an 8-bit control code in the textual "T:G:D:S:NN" notation.
#   bit 7 (0x80) -> 'T' when set, '-' otherwise   (texbar)
#   bit 6 (0x40) -> 'G' when set, '-' otherwise   (globalbar)
#   bit 5 (0x20) -> '-' when set, 'D' when clear  (dual_issue; note inversion)
#   bit 4 (0x10) -> 'S' when set, '-' otherwise   (sharedbar)
#   bits 0-3     -> stall count, printed as two decimal digits
sub printCtrl
{
    my $code = shift;

    return sprintf '%s:%s:%s:%s:%02d',
        (($code & 0x80) ? 'T' : '-'),
        (($code & 0x40) ? 'G' : '-'),
        (($code & 0x20) ? '-' : 'D'),
        (($code & 0x10) ? 'S' : '-'),
        $code & 0x0f;
}

# Inverse of printCtrl: pack a "T:G:D:S:NN" control string into its 8-bit code.
#   $ctrl    - control string, five ':'-separated fields
#   $context - text identifying where the control string came from; quoted
#              in the error message
# Dies when the stall count does not fit the 4-bit stall field. Without this
# check a stall of 16 or more would be OR-ed into the flag bits above it and
# silently change the meaning of the control code.
sub readCtrl
{
    my ($ctrl, $context) = @_;
    my ($texbar, $globalbar, $dual_issue, $sharedbar, $stall) = split ':', $ctrl;

    # "07" -> 7
    $stall = int $stall;

    die sprintf("stall count out of range (0-15): %d at %s\n",
                $stall, defined $context ? $context : '')
        if $stall != ($stall & 0x0f);

    my $code = $stall;
    $code |= 0x80 if     $texbar     eq 'T';
    $code |= 0x40 if     $globalbar  eq 'G';
    $code |= 0x20 unless $dual_issue eq 'D';   # bit set means "no D"
    $code |= 0x10 if     $sharedbar  eq 'S';
    return $code;
}
# List the registers written through the r0 capture of a vector access.
#   $vectors - hash ref mapping a vector name to an array ref of registers
#   $capData - hash ref of captures; r0, type and i31w4 are consulted
# Returns nothing when r0 is absent/false or is 'RZ'. A '.64' type (or i31w4
# of '0x3') yields two registers, a '.128' type (or i31w4 of '0xf') yields
# four, anything else yields r0 itself. A numbered register "R<n>" expands to
# consecutive numbers; a named register must be present in $vectors (with
# exactly four members for the 128-bit case), otherwise confess is raised.
sub getVecRegisters
{
    my ($vectors, $capData) = @_;

    my $regName = $capData->{r0};
    return unless $regName;
    return if $regName eq 'RZ';

    my $count =
        ($capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3') ? 2 :
        ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') ? 4 :
                                                                     1;

    return $regName if $count == 1;

    if ($regName =~ m'^R(\d+)$')
    {
        my $first = $1;
        return map "R$_", ($first .. $first + $count - 1);
    }

    if ($count == 2)
    {
        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
        return @{$vectors->{$regName}}[0,1];
    }

    confess "$regName not a 128bit vector register"
        unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
    return @{$vectors->{$regName}};
}

# List the registers that hold the address given by the r8 capture.
#   $vectors - hash ref mapping a vector name to an array ref of registers
#   $capData - hash ref of captures; r8 and the presence of E are consulted
# Returns nothing when r8 is absent/false or is 'RZ'. Without an E capture the
# result is r8 itself. With E, a numbered register "R<n>" yields R<n> and
# R<n+1>; a named register yields the first two members of its $vectors entry.
# A named register missing from $vectors dumps $vectors to STDOUT and then
# raises confess.
sub getAddrVecRegisters
{
    my ($vectors, $capData) = @_;

    my $regName = $capData->{r8};
    return unless $regName;
    return if $regName eq 'RZ';

    return $regName unless exists $capData->{E};

    if ($regName =~ m'^R(\d+)$')
    {
        my $low = $1;
        return map "R$_", ($low .. $low + 1);
    }

    unless (exists $vectors->{$regName})
    {
        print Dumper($vectors);
        confess "$regName not a 64bit vector register";
    }
    return @{$vectors->{$regName}}[0,1];
}
Standard preamble: +.\" ======================================================================== +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" Set up some character translations and predefined strings. \*(-- will +.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left +.\" double quote, and \*(R" will give a right double quote. \*(C+ will +.\" give a nicer C++. Capital omega is used to do unbreakable dashes and +.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, +.\" nothing in troff, for use with C<>. +.tr \(*W- +.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' +.ie n \{\ +. ds -- \(*W- +. ds PI pi +. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch +. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch +. ds L" "" +. ds R" "" +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds -- \|\(em\| +. ds PI \(*p +. ds L" `` +. ds R" '' +. ds C` +. ds C' +'br\} +.\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" +.\" If the F register is turned on, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.\" +.\" Avoid warning from groff about undefined register 'F'. +.de IX +.. +.nr rF 0 +.if \n(.g .if rF .nr rF 1 +.if (\n(rF:(\n(.g==0)) \{ +. if \nF \{ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. if !\nF==2 \{ +. nr % 0 +. nr F 2 +. \} +. 
\} +.\} +.rr rF +.\" ======================================================================== +.\" +.IX Title "KeplerAs::KeplerAs 3pm" +.TH KeplerAs::KeplerAs 3pm "2018-11-05" "perl v5.22.1" "User Contributed Perl Documentation" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. +.if n .ad l +.nh +.SH "NAME" +KeplerAs::KeplerAs \- Assembler for NVIDIA Maxwell architecture +.SH "SYNOPSIS" +.IX Header "SYNOPSIS" +.Vb 1 +\& KeplerAs.pl [opts] +.Ve +.SH "DESCRIPTION" +.IX Header "DESCRIPTION" +See the documentation at: https://github.com/NervanaSystems/KeplerAs +.SH "SEE ALSO" +.IX Header "SEE ALSO" +See the documentation at: https://github.com/NervanaSystems/KeplerAs +.SH "AUTHOR" +.IX Header "AUTHOR" +Scott Gray, +.SH "COPYRIGHT AND LICENSE" +.IX Header "COPYRIGHT AND LICENSE" +The \s-1MIT\s0 License (\s-1MIT\s0) +.PP +Copyright (c) 2014 Scott Gray +.PP +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the \*(L"Software\*(R"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +.PP +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +.PP +\&\s-1THE SOFTWARE IS PROVIDED \*(L"AS IS\*(R", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.\s0 diff --git a/Assembler/KeplerAs/blib/script/.exists b/Assembler/KeplerAs/blib/script/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/KeplerAs/blib/script/KeplerAs.pl b/Assembler/KeplerAs/blib/script/KeplerAs.pl new file mode 100755 index 0000000..268cc85 --- /dev/null +++ b/Assembler/KeplerAs/blib/script/KeplerAs.pl @@ -0,0 +1,275 @@ +#!/usr/bin/perl +use strict; +use KeplerAs::Cubin; +use KeplerAs::KeplerAs; +use Data::Dumper; +use File::Spec; + +require 5.10.0; + +$Data::Dumper::Sortkeys = 1; + +my $mode = shift; + +if ($mode =~ /^\-?\-l/i) +{ + my $cubinFile = shift or usage(); + + my $cubin = KeplerAs::Cubin->new($cubinFile); + + my $arch = $cubin->arch; + my $class = $cubin->class; + my $asize = $cubin->address_size; + my $kernels = $cubin->listKernels; + my $symbols = $cubin->listSymbols; + + printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize; + + foreach my $ker (sort keys %$kernels) + { + printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)}; + } + foreach my $sym (sort keys %$symbols) + { + printf "Symbol: %s\n", $sym; + } +} +elsif ($mode =~ /^\-?\-t/i) +{ + my $reg = shift if $ARGV[0] =~ /^\-?\-r/i; + my $all = shift if $ARGV[0] =~ /^\-?\-a/i; + my $file = shift or usage(); + my $fh; + if (-T $file) + { + open $fh, $file or die "$file: $!"; + } + else + { + my $cubin = KeplerAs::Cubin->new($file); + my $arch = $cubin->arch; + + open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!"; + my $first = <$fh>; + if ($first =~ /cuobjdump fatal/) + { + print 
$first; + exit(1); + } + } + exit(KeplerAs::KeplerAs::Test($fh, $reg, $all) ? 1 : 0); +} +elsif ($mode =~ /^\-?\-e/i) +{ + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $cubinFile = shift or usage(); + my $asmFile = shift; + my $cubin = KeplerAs::Cubin->new($cubinFile); + my $arch = $cubin->arch; + my $kernels = $cubin->listKernels; + + $kernelName ||= (sort keys %$kernels)[0]; + + my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName"; + + open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_35 -sass -fun $kernelName $cubinFile: $!"; + my $first = <$in>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n"; + + print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt)); + + print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n"; + + print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params}; + + print $out "#\n# Instructions:\n\n"; + + KeplerAs::KeplerAs::Extract($in, $out, $kernel->{Params}); + + close $out if $asmFile; + close $in; +} +elsif ($mode =~ /^\-?\-s/i) +{ + my $sassFile = shift or usage(); + my $asmFile = shift; + + open my $in, $sassFile or die "$sassFile: $!"; + + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + KeplerAs::KeplerAs::Extract($in, $out, []); + + close $out if $asmFile; + close $in; +} +elsif ($mode =~ /^\-?\-i/i) +{ + my $nowarn; + if ($ARGV[0] =~ /^\-?\-w/i) + { + $nowarn = shift; + } + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $noReuse = shift if $ARGV[0] =~ /^\-?\-n/i; + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my 
$value = shift; + eval "package KeplerAs::KeplerAs::CODE; our \$$name = '$value';" + } + + my $asmFile = shift or usage(); + my $cubinFile = shift or usage(); + my $newCubin = shift || $cubinFile; + + my $file; + if (open my $fh, $asmFile) + { + local $/; + $file = <$fh>; + close $fh; + } + else { die "$asmFile: $!" } + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName; + die "asm file missing kernel name or is badly formatted" unless $kernelName; + + my $kernel = KeplerAs::KeplerAs::Assemble($file, $include, !$noReuse, $nowarn); + + my $cubin = KeplerAs::Cubin->new($cubinFile); + $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName"; + + $cubin->modifyKernel(%$kernel); + + $cubin->write($newCubin); + + printf "Kernel: $kernelName, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f% (%d/%d)\n", + @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)}; + +} +elsif ($mode =~ /^\-?\-p/i) +{ + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package KeplerAs::KeplerAs::CODE; our \$$name = '$value';"; + } + my $debug = shift if $ARGV[0] =~ /^\-?\-d/i; + my $asmFile = shift or usage(); + my $asmFile2 = shift; + + die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2; + + open my $fh, $asmFile or die "$asmFile: $!"; + local $/; + my $file = <$fh>; + close $fh; + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + if ($asmFile2) + { + open $fh, ">$asmFile2" or die "$asmFile2: $!"; + } + else + { + $fh = \*STDOUT; + } + print $fh KeplerAs::KeplerAs::Preprocess($file, $include, $debug); + close $fh; +} +elsif ($mode =~ /^\-?\-v/i) +{ + print "$KeplerAs::KeplerAs::VERSION\n"; +} +else +{ + print "$mode\n"; + usage(); +} + +exit(0); + + + +sub usage +{ + print < + + Test a cubin or 
sass file to see if the assembler can reproduce all of the contained opcodes. + Also useful for extending the missing grammar rules. Defaults to only showing failures without --all. + With the --reg flag it will show register bank conflicts not hidden by reuse flags. + + KeplerAs.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file> + + Extract a single kernel into an asm file from a cubin. + Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin. + + KeplerAs.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file] + + Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes. + Include the debug flag to print out detailed scheduler info. + + KeplerAs.pl --pre|-p [--debug|-d] <asm_file> [new_asm_file] + + Insert the kernel asm back into the cubin. Overwrite existing or create new cubin. + Optionally you can skip register reuse flag auto insertion. This allows you to observe + performance without any reuse or you can use it to set the flags manually in your sass. 
+ + KeplerAs.pl --insert|-i [--noreuse|-n] [new_cubin_file] + + Display version information and exit: + + KeplerAs.pl --version|-v + +EOF + exit(1); +} + +__END__ diff --git a/Assembler/KeplerAs/lib/KeplerAs/Cubin.pm b/Assembler/KeplerAs/lib/KeplerAs/Cubin.pm new file mode 100644 index 0000000..867342d --- /dev/null +++ b/Assembler/KeplerAs/lib/KeplerAs/Cubin.pm @@ -0,0 +1,604 @@ +package KeplerAs::Cubin; + +use strict; +use Data::Dumper; + +my @Elf32_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + L entry + L phOffset + L shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf64_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + Q entry + Q phOffset + Q shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf32_PrgHdr = qw( + L type + L offset + L vaddr + L paddr + L fileSize + L memSize + L flags + L align +); +my @Elf64_PrgHdr = qw( + L type + L flags + Q offset + Q vaddr + Q paddr + Q fileSize + Q memSize + Q align +); +my @Elf32_SecHdr = qw( + L name + L type + L flags + L addr + L offset + L size + L link + L info + L align + L entSize +); +my @Elf64_SecHdr = qw( + L name + L type + Q flags + Q addr + Q offset + Q size + L link + L info + Q align + Q entSize +); +my @Elf32_SymEnt = qw( + L name + L value + L size + C info + C other + S shIndx +); +my @Elf64_SymEnt = qw( + L name + C info + C other + S shIndx + Q value + Q size +); +my @symBind = qw(LOCAL GLOBAL WEAK); + +my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC); + +$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr; +$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr; +$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr; +$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt; + +$elfHdrT[2] = join '', grep { 
length($_) <= 3} @Elf64_Hdr; +$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr; +$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr; +$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt; + +$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr ]; +$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ]; +$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ]; +$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ]; + +$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr ]; +$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ]; +$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ]; +$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ]; + +sub new +{ + my ($package, $file) = @_; + + my $cubin = bless { fileName => $file }, $package; + + open my $fh, $file or die "$file: $!"; + binmode($fh); + + my $data; + read $fh, $data, 0x34; + my $elfHdr = $cubin->{elfHdr} = {}; + @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data; + + my $class = $elfHdr->{fileClass}; + + if ($class == 2) + { + seek $fh, 0, 0; + read $fh, $data, 0x46; + @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data; + + $cubin->{Class} = 64; + } + else + { + $cubin->{Class} = 32; + } + + $cubin->{Arch} = "35"; + die "Cubin not in sm_35. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} != 35; + $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32; + + seek $fh, $elfHdr->{phOffset}, 0; + foreach (1 .. $elfHdr->{phNum}) + { + read $fh, $data, $elfHdr->{phEntSize}; + + my %prgHdr = (Indx => $_ - 1); + @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data; + push @{$cubin->{prgHdrs}}, \%prgHdr; + } + + seek $fh, $elfHdr->{shOffset}, 0; + foreach (1 .. 
$elfHdr->{shNum}) + { + read $fh, $data, $elfHdr->{shEntSize}; + + my %secHdr = (Indx => $_ - 1); + @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data; + push @{$cubin->{secHdrs}}, \%secHdr; + } + + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $data = ''; + if ($secHdr->{size} && $secHdr->{type} != 8) + { + seek $fh, $secHdr->{offset}, 0; + read $fh, $data, $secHdr->{size}; + } + if ($secHdr->{type} == 3) # STRTAB + { + my $strTab = $secHdr->{StrTab} = {}; + my $indx = 0; + foreach my $str (split "\0", $data) + { + $strTab->{$indx} = $str; + $indx += 1 + length($str); + } + } + if ($secHdr->{type} == 2) # SYMTAB + { + my $offset = 0; + while ($offset < $secHdr->{size}) + { + my $symEnt = {}; + @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize}); + $offset += $secHdr->{entSize}; + + push @{$secHdr->{SymTab}}, $symEnt; + } + } + $secHdr->{Data} = unpack 'H*', $data; + } + close $fh; + + my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab}; + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $secHdr->{Name} = $shStrTab->{$secHdr->{name}}; + $cubin->{$secHdr->{Name}} = $secHdr; + } + + my $strTab = $cubin->{'.strtab'}{StrTab}; + foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}}) + { + $symEnt->{Name} = $strTab->{$symEnt->{name}}; + + my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}]; + $secHdr->{SymbolEnt} = $symEnt; + + if (($symEnt->{info} & 0x0f) == 0x02) + { + my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr; + + $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4]; + + $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ]; + + $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20; + + $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24; + + my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"}; + $kernelSec->{SharedSize} = $sharedSec ? 
$sharedSec->{size} : 0; + + $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"}; + + my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"}; + if ($paramSec) + { + my @data = unpack "L*", pack "H*", $paramSec->{Data}; + + $paramSec->{ParamData} = \@data; + $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ]; + + my $idx = 0; + $idx++ while $idx < @data && $data[$idx] != 0x00080a04; + + my $first = $data[$idx+2] & 0xFFFF; + $idx += 4; + + my @params; + while ($idx < @data && $data[$idx] == 0x000c1704) + { + my $ord = $data[$idx+2] & 0xFFFF; + my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16); + my $psize = $data[$idx+3] >> 18; + my $align = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0; + unshift @params, "$ord:$offset:$psize:$align"; + $idx += 4; + } + my @staticParams = @data[0 .. ($idx-1)]; + + + my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize); + while ($idx < @data) + { + my $code = $data[$idx] & 0xffff; + my $size = $data[$idx] >> 16; + $idx++; + + + if ($code == 0x1b03) + { + $maxregCount = $size; + } + elsif ($code == 0x1d04) + { + while ($size > 0) + { + push @ctaidOffsets, $data[$idx++]; + $size -= 4; + } + } + elsif ($code == 0x1c04) + { + while ($size > 0) + { + push @exitOffsets, $data[$idx++]; + $size -= 4; + } + } + elsif ($code == 0x0401) + { + $ctaidzUsed = 1; + } + elsif ($code == 0x1004) + { + while ($size > 0) + { + push @reqntid, $data[$idx++]; + $size -= 4; + } + } + elsif ($code == 0x0504) + { + while ($size > 0) + { + push @maxntid, $data[$idx++]; + $size -= 4; + } + } + elsif ($code == 0x1e04) + { + while ($size > 0) + { + push @stackSize, $data[$idx++]; + $size -= 4; + } + } + else + { + printf "Unknown Code 0x%02x (size:%d)\n", $code, $size; + } + } + $kernelSec->{Params} = \@params; + $kernelSec->{ParamCnt} = scalar @params; + + $paramSec->{StaticParams} = \@staticParams; + $paramSec->{MAXREG_COUNT} = $maxregCount; + 
$paramSec->{ExitOffsets} = \@exitOffsets; + $paramSec->{CTAIDOffsets} = \@ctaidOffsets; + $paramSec->{CTAIDZUsed} = $ctaidzUsed; + $paramSec->{REQNTID} = \@reqntid; + $paramSec->{MAXNTID} = \@maxntid; + $paramSec->{STACKSIZE} = \@stackSize; + } + } + elsif (($symEnt->{info} & 0x10) == 0x10) + { + $cubin->{Symbols}{$symEnt->{Name}} = $symEnt; + } + } + + + return $cubin; +} +sub class +{ + return shift()->{Class}; +} +sub arch +{ + return shift()->{Arch}; +} +sub address_size +{ + return shift()->{AddressSize}; +} +sub listKernels +{ + return shift()->{Kernels}; +} +sub listSymbols +{ + return shift()->{Symbols}; +} +sub getKernel +{ + my ($cubin, $kernel) = @_; + return $cubin->{Kernels}{$kernel}; +} + +sub modifyKernel +{ + my ($cubin, %params) = @_; + + my $kernelSec = $params{Kernel}; + my $newReg = $params{RegCnt}; + my $newBar = $params{BarCnt}; + my $exitOffsets = $params{ExitOffsets}; + my $ctaidOffsets = $params{CTAIDOffsets}; + my $ctaidzUsed = $params{CTAIDZUsed}; + my $newData = $params{KernelData}; + my $newSize = @$newData * 8; + + die "255 register max" if $newReg > 255; + die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63; + die "16 is max barrier count" if $newBar > 16; + + my $paramSec = $kernelSec->{ParamSec}; + my $kernelName = $kernelSec->{SymbolEnt}{Name}; + my $maxregCount = $paramSec->{MAXREG_COUNT}; + my $stackSize = $paramSec->{STACKSIZE}; + + $kernelSec->{KernelData} = $newData; + $kernelSec->{Data} = unpack "H*", pack "Q*", @$newData; + + if ($newReg != $kernelSec->{RegCnt}) + { + print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n"; + $kernelSec->{RegCnt} = $newReg; + $kernelSec->{info} &= ~0xff000000; + $kernelSec->{info} |= $newReg << 24; + } + if ($newBar != $kernelSec->{BarCnt}) + { + print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n"; + $kernelSec->{BarCnt} = $newBar; + $kernelSec->{flags} &= ~0x01f00000; + $kernelSec->{flags} |= $newBar << 20; + } + + my 
@paramData = @{$paramSec->{StaticParams}}; + if (defined $maxregCount) + { + push @paramData, ($maxregCount << 16) | 0x1b03; + } + + + my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets; + my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}}; + + if ($newCTAIDs ne $oldCTAIDs) + { + print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"; + } + if (@$ctaidOffsets) + { + push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04; + push @paramData, @$ctaidOffsets; + } + + my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets; + my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}}; + + if ($newExits ne $oldExits) + { + print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"; + } + if (@$exitOffsets) + { + push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04; + push @paramData, @$exitOffsets; + } + + if ($ctaidzUsed != $paramSec->{CTAIDZUsed}) + { + print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"; + } + if ($ctaidzUsed) + { + push @paramData, 0x0401; + } + + if (@{$paramSec->{REQNTID}}) + { + push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004; + push @paramData, @{$paramSec->{REQNTID}}; + } + if (@{$paramSec->{MAXNTID}}) + { + push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504; + push @paramData, @{$paramSec->{MAXNTID}}; + } + if (@$stackSize) + { + push @paramData, (scalar(@$stackSize) << 18) | 0x1e04; + push @paramData, @$stackSize; + } + + my $newParamSize = scalar(@paramData)*4; + $paramSec->{Data} = unpack "H*", pack "L*", @paramData; + if ($newParamSize != $paramSec->{size}) + { + print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n"; + $cubin->updateSize($paramSec, $newParamSize); + } + + if ($newSize != $kernelSec->{size}) + { + print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n"; + $cubin->updateSize($kernelSec, $newSize, 1); 
+ } +} + +sub updateSize +{ + my ($cubin, $sec, $newSize, $updatePrgSize) = @_; + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + my $delta = $newSize - $sec->{size}; + $sec->{size} = $newSize; + + if ($sec->{SymbolEnt}) + { + $sec->{SymbolEnt}{size} = $newSize; + my $symSection = $cubin->{'.symtab'}; + $symSection->{Data} = ''; + foreach my $symEnt (@{$symSection->{SymTab}}) + { + $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}}; + } + } + + my $pos = $elfHdr->{ehSize}; + my %sizeMap; + + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + next if $secHdr->{align} == 0; + + my $size = $secHdr->{type} == 8 ? 0 : $secHdr->{size}; + + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pos += $secHdr->{align} - $pad; + } + $sizeMap{$secHdr->{offset}} = $pos; + + $secHdr->{offset} = $pos; + + $pos += $size; + } + + my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset}; + + $sizeMap{$elfHdr->{shOffset}} = $pos; + $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize; + + $elfHdr->{shOffset} = $pos; + $elfHdr->{phOffset} = $pos + $shSize; + + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}}; + + if ($updatePrgSize && $prgHdr->{type} == 1 && + $sec->{offset} >= $prgHdr->{offset} && + $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta) + { + $prgHdr->{fileSize} += $delta; + $prgHdr->{memSize} += $delta; + } + } +} + +sub write +{ + my ($cubin, $file) = @_; + + open my $fh, ">$file" or die "Error: could not open $file for writing: $!"; + binmode($fh); + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}}; + my $pos = $elfHdr->{ehSize}; + + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + next if $secHdr->{size} == 0 || $secHdr->{type} == 8; + + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pad = $secHdr->{align} - $pad; + print $fh join '', "\0" x $pad; + 
$pos += $pad; + } + + print $fh pack 'H*', $secHdr->{Data}; + $pos += $secHdr->{size}; + } + + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}}; + } + + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}}; + } + close $fh; +} + +__END__ + diff --git a/Assembler/KeplerAs/lib/KeplerAs/KeplerAs.pm b/Assembler/KeplerAs/lib/KeplerAs/KeplerAs.pm new file mode 100644 index 0000000..34dfbcd --- /dev/null +++ b/Assembler/KeplerAs/lib/KeplerAs/KeplerAs.pm @@ -0,0 +1,1196 @@ +package KeplerAs::KeplerAs; + +require 5.10.0; + +use strict; +use Data::Dumper; +use KeplerAs::KeplerAsGrammar; +use File::Spec; +use Carp; + +our $VERSION = '1.06'; + +my %relOffset = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT); + +my %absOffset = map { $_ => 1 } qw(JCAL); + +my %jumpOp = (%relOffset, %absOffset); + +my %noDest = map { $_ => 1 } qw(ST STG STS STL RED); + +my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4); + +sub Assemble +{ + my ($file, $include, $doReuse, $nowarn) = @_; + + my $regMap = {}; + $file = Preprocess($file, $include, 0, $regMap); + my $vectors = delete $regMap->{__vectors}; + my $regBank = delete $regMap->{__regbank}; + + my $regCnt = 0; + my $barCnt = 0; + + my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse); + + push @instructs, $ctrl = {}; + + foreach my $line (split "\n", $file) + { + $lineNum++; + + next unless preProcessLine($line); + + if (my $inst = processAsmLine($line, $lineNum)) + { + + push @branches, @instructs+0 if exists $jumpOp{$inst->{op}}; + + push @{$ctrl->{ctrl}}, $inst->{ctrl}; + + $inst->{ctrl} = $ctrl; + + push @instructs, $inst; + push @instructs, $ctrl = {} if ((@instructs & 7) == 0); + } + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + $labels{$1} = @instructs+0; + } + else + { + die "badly formed line at $lineNum: $line\n"; + } + } + push @{$ctrl->{ctrl}}, 0x00; + push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' }; + 
while (@instructs & 7) + { + push @instructs, $ctrl = {} if ((@instructs & 7) == 0); + push @{$ctrl->{ctrl}}, 0x00; + push @instructs, { op => 'NOP', inst => 'NOP;' }; + } + + foreach my $i (@branches) + { + if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1}) + { die "instruction has invalid label: $instructs[$i]{inst}"; } + + $instructs[$i]{jump} = $labels{$1}; + + if (exists $relOffset{$instructs[$i]{op}}) + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; } + else + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; } + } + + foreach my $i (0 .. $#instructs) + { + next unless $i & 7; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + my $capData = parseInstruct($inst, $gram) or next; + + if ($doReuse) + { + my @r0 = getVecRegisters($vectors, $capData); + + + if (@r0 && !exists $noDest{$op}) + { + foreach my $slot (keys %reuseSlots) + { + if (my $reuse = $reuse{$slot}) + { + delete $reuse->{$_} foreach @r0; + } + } + } + %reuse = () if exists $jumpOp{$op}; + + if ($gram->{type}{reuse}) + { + foreach my $slot (keys %reuseSlots) + { + next unless exists $capData->{$slot}; + + my $r = $capData->{$slot}; + next if $r eq 'RZ'; + next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction + + my $reuse = $reuse{$slot} ||= {}; + + if (my $p = $reuse->{$r}) + { + $instructs[$p]{ctrl}{reuse}[($p & 7) - 1] |= $reuseSlots{$slot}; + + } + elsif (keys %$reuse > 2) + { + my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0]; + delete $reuse->{$oldest}; + } + $reuse->{$r} = $i; + } + } + } + elsif ($gram->{type}{reuse}) + { + $ctrl->{reuse}[($i & 7) - 1] = genReuseCode($capData); + } + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + foreach my $r (sort 
keys %$regBank) + { + my $bank = $regBank->{$r}; + my $avail = $regMap->{$r}; + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 7)) + { + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + last; + } + } + } + + my (%liveTime, %pairedBanks, %reuseHistory); + foreach my $i (0 .. $#instructs) + { + next unless $i & 7; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + my $capData = parseInstruct($inst, $gram) or next; + my $reuseType = $gram->{type}{reuse}; + + my (%addReuse, %delReuse); + foreach my $slot (qw(r8 r20 r39)) + { + my $r = $capData->{$slot} or next; + next if $r eq 'RZ'; + + my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r}; + + if (my $liveTime = $liveTime{$liveR}) + { + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + + my $slotHist = $reuseHistory{$slot} ||= {}; + my $selfReuse = $reuseType ? 
exists $slotHist->{$r} : 0; + + + if (!$selfReuse && ref $regMap->{$r}) + { + foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39)) + { + my $r2 = $capData->{$slot2}; + next if $r2 eq 'RZ' || $r2 eq $r; + + my $slotHist2 = $reuseHistory{$slot2} ||= {}; + + + if (!$reuseType || !exists $slotHist2->{$r2}) + { + if (ref $regMap->{$r2}) + { + push @{$pairedBanks{$r}{pairs}}, $r2; + $pairedBanks{$r}{banks} ||= []; + } + else + { + my $bank = substr($regMap->{$r2},1) & 7; + + $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++; + $pairedBanks{$r}{pairs} ||= []; + } + $pairedBanks{$r}{useCnt}++; + } + } + } + if ($reuseType) + { + if ($ctrl->{reuse}[($i & 7) - 1] & $reuseSlots{$slot}) + { $addReuse{$slot} = $r; } + else + { $delReuse{$slot} = $r; } + } + } + $reuseHistory{$_}{$addReuse{$_}} = 1 foreach keys %addReuse; + delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse; + + foreach my $r0 (getVecRegisters($vectors, $capData)) + { + my $liveR = ref $regMap->{$r0} ? 
$r0 : $regMap->{$r0}; + + if (exists $noDest{$op}) + { + if (my $liveTime = $liveTime{$liveR}) + { + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r0): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + } + elsif (my $liveTime = $liveTime{$liveR}) + { + if ($i > $liveTime->[$#$liveTime][1]) + { + push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + else + { + push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + foreach my $r (sort { + $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} || + $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} || + $a cmp $b + } keys %pairedBanks) + { + my $banks = $pairedBanks{$r}{banks}; + my $avail = $regMap->{$r}; + + + BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..7)) + { + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 7)) + { + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + + $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}}; + last BANK; + } + } + } + } + foreach my $r (sort keys %$regMap) + { + if (ref($regMap->{$r}) eq 'ARRAY') + { + $regMap->{$r} = 'R' . shift @{$regMap->{$r}}; + } + } + + foreach my $i (0 .. $#instructs) + { + next unless $i & 7; + + $instructs[$i]{orig} = $instructs[$i]{inst}; + $instructs[$i]{inst} =~ s/(?{$1}) ? 
$regMap->{$1} : $1 /ge; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + my $capData = parseInstruct($inst, $gram) or next; + + foreach my $r (qw(r0 r8 r20 r39)) + { + next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ'; + + my $val = substr $capData->{$r}, 1; + + my @r0 = getVecRegisters($vectors, $capData); + my @r8 = getAddrVecRegisters($vectors, $capData); + + my $regInc = $r eq 'r0' ? scalar(@r0) || 1 : 1; + my $regInc = $r eq 'r8' ? scalar(@r8) || 1 : 1; + + if ($val + $regInc > $regCnt) + { + $regCnt = $val + $regInc; + } + } + if ($op eq 'BAR') + { + if (exists $capData->{i8w4}) + { + $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt; + } + elsif (exists $capData->{r8}) + { + $barCnt = 16; + } + } + my ($code, $reuse) = genCode($op, $gram, $capData); + $instructs[$i]{code} = $code; + + if ($gram->{type}{reuse}) + { $instructs[$i]{caps} = $capData; } + else + { $ctrl->{reuse}[($i & 7) - 1] = $reuse; } + + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed); + foreach my $i (0 .. 
$#instructs) + { + if ($i & 7) + { + push @codes, $instructs[$i]{code}; + my $code_dec= $instructs[$i]{code}; + my $code_hex = sprintf("0x%x", $code_dec); + + if ($instructs[$i]{caps}) + { + registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 7) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn); + } + if ($instructs[$i]{inst} =~ m'EXIT') + { + push @exitOffsets, (scalar(@codes)-1)*8; + } + elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)') + { + push @ctaidOffsets, (scalar(@codes)-1)*8; + $ctaidzUsed = 1 if $1 eq 'Z'; + } + } + else + { + my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)}; + push @codes, + ($ctrl->[0] << 2) | ($ctrl->[1] << 10) | ($ctrl->[2] << 18) | # ctrl codes + ($ctrl->[3] << 26) | ($ctrl->[4] << 34) | ($ctrl->[5] << 42) | + ($ctrl->[6] << 50) | (0x0800000000000000); # reuse codes + } + } + + return { + RegCnt => $regCnt, + BarCnt => $barCnt, + ExitOffsets => \@exitOffsets, + CTAIDOffsets => \@ctaidOffsets, + CTAIDZUsed => $ctaidzUsed, + ConflictCnt => $reuseHistory{conflicts}, + ReuseCnt => $reuseHistory{reuse}, + ReuseTot => $reuseHistory{total}, + ReusePct => ($reuseHistory{total} ? 
100 * $reuseHistory{reuse} / $reuseHistory{total} : 0), + KernelData => \@codes, + }; +} + +sub Test +{ + my ($fh, $printConflicts, $all) = @_; + + my @instructs; + my %reuseHistory; + my ($pass, $fail) = (0,0); + + while (my $line = <$fh>) + { + my (@ctrl, @reuse); + + next unless processSassCtrlLine($line, \@ctrl, \@reuse); + + foreach my $fileReuse (@reuse) + { + $line = <$fh>; + + my $inst = processSassLine($line) or next; + + $inst->{reuse} = $fileReuse; + my $fileCode = $inst->{code}; + + if (exists $relOffset{$inst->{op}}) + { + $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e; + } + + my $match = 0; + foreach my $gram (@{$grammar{$inst->{op}}}) + { + my $capData = parseInstruct($inst->{inst}, $gram) or next; + my @caps; + + my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps); + + registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse}; + + $inst->{caps} = join ', ', sort @caps; + $inst->{codeDiff} = $fileCode ^ $code; + $inst->{reuseDiff} = $fileReuse ^ $reuse; + + if ($code == $fileCode && $reuse == $fileReuse) + { + $inst->{grade} = 'PASS'; + push @instructs, $inst if $all; + $pass++; + } + else + { + $inst->{grade} = 'FAIL'; + push @instructs, $inst; + $fail++; + } + $match = 1; + last; + } + unless ($match) + { + $inst->{grade} = 'FAIL'; + $inst->{codeDiff} = $fileCode; + $inst->{reuseDiff} = $fileReuse; + push @instructs, $inst; + $fail++; + } + } + } + my %maxLen; + foreach (@instructs) + { + $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}}; + } + my ($lastOp, $template); + foreach my $inst (sort { + $a->{op} cmp $b->{op} || + $a->{codeDiff} <=> $b->{codeDiff} || + $a->{reuseDiff} <=> $b->{reuseDiff} || + $a->{ins} cmp $b->{ins} + } @instructs) + { + if ($lastOp ne $inst->{op}) + { + $lastOp = $inst->{op}; + $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s %s\n"; + printf "\n%s 
%-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures); + } + printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)}; + } + my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0; + + # Summary line: "%%" prints the literal percent sign after the reuse percentage (a bare "% (" is not a valid printf conversion). + printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n", + $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total}; + + # The failure count is the return value of Test. + return $fail; +} + +sub Extract +{ + my ($in, $out, $params) = @_; + + my %paramMap; + my %constants = + ( + blockDimX => 'c[0x0][0x28]', + blockDimY => 'c[0x0][0x2c]', + blockDimZ => 'c[0x0][0x30]', + gridDimX => 'c[0x0][0x34]', + gridDimY => 'c[0x0][0x38]', + gridDimZ => 'c[0x0][0x3c]', + ); + print $out "\n"; + + foreach my $const (sort keys %constants) + { + print $out " $const : $constants{$const}\n"; + $paramMap{$constants{$const}} = $const; + } + print $out "\n"; + + foreach my $p (@$params) + { + my ($ord,$offset,$size,$align) = split ':', $p; + + if ($size > 4) + { + my $num = 0; + $offset = hex $offset; + while ($size > 0) + { + my $param = sprintf 'param_%d[%d]', $ord, $num; + my $const = sprintf 'c[0x0][0x%x]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + $size -= 4; + $offset += 4; + $num += 1; + } + } + else + { + my $param = sprintf 'param_%d', $ord; + my $const = sprintf 'c[0x0][%s]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + } + } + print $out "\n\n"; + + my %labels; + my $labelnum = 1; + + my @data; + FILE: while (my $line = <$in>) + { + my (@ctrl, @ruse); + next unless processSassCtrlLine($line, \@ctrl, \@ruse); + + CTRL: foreach my $ctrl (@ctrl) + { + $line = <$in>; + + my $inst = processSassLine($line) or next CTRL; + + if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)') + { + my $target = hex($1); + + last FILE if $inst->{op} eq 'BRA' && ($target == $inst->{num}|| $target 
== $inst->{num}-8); + + my $label = $labels{$target}; + unless ($label) + { + $label = $labels{$target} = "TARGET$labelnum"; + $labelnum++; + } + $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/; + } + $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg; + + $inst->{ctrl} = printCtrl($ctrl); + + push @data, $inst; + } + } + foreach my $inst (@data) + { + print $out "$labels{$inst->{num}}:\n" if exists $labels{$inst->{num}}; + printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)}; + } +} + +my $CommentRe = qr'^[\t ]*.*?^\s*\n?'ms; +my $IncludeRe = qr'^[\t ]*\n?'ms; +my $CodeRe = qr'^[\t ]*(.*?)^\s*<\/CODE\1>\n?'ms; +my $ConstMapRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $RegMapRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $ScheduleRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $InlineRe = qr'\[(\+|\-)(.+?)\1\]'ms; + +# IncludeFile($file, $include): return the entire text of an included file. +# If $file cannot be opened as given, its base name is retried under File::Spec->catpath(@$include, ...); dies when that also fails. +sub IncludeFile +{ + my ($file, $include) = @_; + my ($vol,$dir,$name) = File::Spec->splitpath($file); + local $/; # slurp mode: the single read below returns the whole file + my $fh; + # Three-argument open: the path is always opened read-only and is never parsed for mode or pipe characters. + if (!open $fh, '<', $file) + { + open $fh, '<', File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n"; + } + my $content = <$fh>; + close $fh; + return $content; +} + +sub Preprocess +{ + my ($file, $include, $debug, $regMap) = @_; + + my $constMap = {}; + my $removeRegMap; + if ($regMap) + { $removeRegMap = 1; } + else + { $regMap = {}; } + + 1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg; + + $file =~ s|$CommentRe||g; + + 1 while $file =~ s|$CodeRe| + my $out = eval "package KeplerAs::KeplerAs::CODE; $2"; + $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg; + + $file =~ s|$InlineRe| + my ($type, $code) = ($1, $2); + my $out = eval "package KeplerAs::KeplerAs::CODE; $code"; + $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg; + + $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg; + + my @newFile; + foreach my $line (split "\n", $file) + { + if ($line !~ m'^\s*(?:#|//).*') + { + $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? 
$constMap->{$1} : $1 |eg; + } + push @newFile, $line; + } + $file = join "\n", @newFile; + + $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg; + + my @schedBlocks = $file =~ /$ScheduleRe/g; + + foreach my $i (0 .. $#schedBlocks) + { + $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]); + + $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug); + } + + $file =~ s|$ScheduleRe| shift @schedBlocks |eg; + + return $file; +} + +my %srcReg = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X); +my %destReg = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC); +my %regops = (%srcReg, %destReg); +my @itypes = qw(class lat rlat tput dual); + +sub Scheduler +{ + my ($block, $blockNum, $regMap, $debug) = @_; + + my $vectors = $regMap->{__vectors}; + my $lineNum = 0; + + my (@instructs, @comments, $ordered, $first); + foreach my $line (split "\n", $block) + { + $lineNum++; + + unless (preProcessLine($line)) + { + push @comments, $line if $line =~ m'\S'; + next; + } + + if (my $inst = processAsmLine($line, $lineNum)) + { + $inst->{first} = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1; + + $inst->{exeTime} = 0; + $inst->{order} = $ordered++ if $ordered; + push @instructs, $inst; + } + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + die "SCHEDULE_BLOCK's cannot contain labels. 
block: $blockNum line: $lineNum\n"; + } + elsif ($line =~ m'^') + { + die "you cannot use nested tags" if $ordered; + $ordered = 1; + } + elsif ($line =~ m'^') + { + die "missing opening for closing tag" if !$ordered; + $ordered = 0; + } + else + { + die "badly formed line at block: $blockNum line: $lineNum: $line\n"; + } + } + + my (%writes, %reads, @ready, @schedule, $orderedParent); + foreach my $instruct (@instructs) + { + my $match = 0; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $capData = parseInstruct($instruct->{inst}, $gram) or next; + my (@dest, @src); + + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + + push @src, $instruct->{predReg} if $instruct->{pred}; + + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src; + my $mask = hex($capData->{i20w7}); + foreach my $p (0..6) + { + if ($mask & (1 << $p)) + { + push @$list, "P$p"; + } + elsif ($instruct->{op} eq 'R2P') + { + push @src, "P$p"; + } + } + } + + foreach my $operand (grep { exists $regops{$_} } sort keys %$capData) + { + my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src; + + my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT'; + + if ($capData->{$operand} ne $badVal) + { + push @$list, + $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) : + $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) : + $operand eq 'CC' ? 'CC' : + $operand eq 'X' ? 'CC' : + getRegNum($regMap, $capData->{$operand}); + } + } + $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39}); + + foreach my $src (grep { exists $writes{$_} } @src) + { + my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat}; + + foreach my $parent (@{$writes{$src}}) + { + my $latency = $src =~ m'^P\d' ? 
13 : $parent->{lat}; + push @{$parent->{children}}, [$instruct, $latency - $regLatency]; + $instruct->{parents}++; + + last unless $parent->{pred}; + } + } + + foreach my $dest (grep { exists $reads{$_} } @dest) + { + foreach my $reader (@{$reads{$dest}}) + { + push @{$reader->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + delete $reads{$dest} unless $instruct->{pred}; + } + + if ($instruct->{order}) + { + if ($orderedParent) + { + push @{$orderedParent->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + $orderedParent = $instruct; + } + elsif ($orderedParent) + { $orderedParent = 0; } + + unshift @{$writes{$_}}, $instruct foreach @dest; + + push @{$reads{$_}}, $instruct foreach @src; + + push @ready, $instruct if !exists $instruct->{parents}; + + $match = 1; + last; + } + die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match; + } + %writes = (); + %reads = (); + + if (@ready) + { + my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" }; + + countUniqueDescendants($readyParent, {}); + updateDepCounts($readyParent, {}); + + @ready = sort { + $a->{first} <=> $b->{first} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + my $clock = 0; + while (my $instruct = shift @ready) + { + my $stall = $instruct->{stall}; + + if (@schedule && $stall < 16) + { + my $prev = $schedule[$#schedule]; + + $prev->{ctrl} &= $stall > 4 ? 
0x1ffe0 : 0x1fff0; + $prev->{ctrl} |= $stall; + $clock += $stall; + } + else + { + $instruct->{ctrl} &= 0x1fff0; + $instruct->{ctrl} |= 1; + $clock += 1; + } + print "$clock: $instruct->{inst}\n" if $debug; + + push @schedule, $instruct; + + if (my $children = $instruct->{children}) + { + foreach (@$children) + { + my ($child, $latency) = @$_; + + my $earliest = $clock + $latency; + $child->{exeTime} = $earliest if $child->{exeTime} < $earliest; + + print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug; + + push @ready, $child if --$child->{parents} < 1; + } + delete $instruct->{children}; + } + + foreach my $ready (@ready) + { + $stall = $ready->{exeTime} - $clock; + $stall = 1 if $stall < 1; + + if ($ready->{class} eq $instruct->{class}) + { + $stall = $ready->{tput} if $stall < $ready->{tput}; + } + elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && + $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const})) + { + $stall = 0; + } + $ready->{stall} = $stall; + + $ready->{mix} = $ready->{class} ne $instruct->{class} || 0; + } + + @ready = sort { + $a->{first} <=> $b->{first} || + $a->{stall} <=> $b->{stall} || + $b->{mix} <=> $a->{mix} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + # Dump the re-sorted ready list; the "f" column is read from {first}, the same key the sort above orders by. + print "\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + my $out; + $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule; + return $out; +} + +sub setConstMap +{ + my ($constMap, $constMapText) = @_; + + foreach my $line (split "\n", $constMapText) + { + $line =~ s|^\s+||; + $line =~ s{(?:#|//).*}{}; + $line =~ s|\s+$||; + next unless $line =~ m'\S'; + + my ($name, $value) = split '\s*:\s*', $line; + + $constMap->{$name} = $value; + } + return; +} + +sub setRegisterMap +{ + my ($regMap, $regmapText) = @_; + + my 
$vectors = $regMap->{__vectors} ||= {}; + my $regBank = $regMap->{__regbank} ||= {}; + my %aliases; + + foreach my $line (split "\n", $regmapText) + { + $line =~ s|^\s+||; + $line =~ s{(?:#|//).*}{}; + $line =~ s|\s+$||; + next unless $line =~ m'\S'; + + my $auto = $line =~ /~/; + my $share = $line =~ /=/; + + my ($regNums, $regNames) = split '\s*[:~=]\s*', $line; + + my (@numList, @nameList, %vecAliases); + foreach my $num (split '\s*,\s*', $regNums) + { + my ($start, $stop) = split '\s*\-\s*', $num; + die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop; + push @numList, ($start .. $stop||$start); + } + foreach my $fullName (split '\s*,\s*', $regNames) + { + if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$') + { + my ($name1, $name2, $bank) = ($1, $3, $4); + foreach (split '\s*\|\s*', $2) + { + my ($start, $stop) = split '\s*\-\s*'; + foreach my $r (map "$name1$_$name2", $start .. 
$stop||$start) + { + $aliases{$r} = "$name1$name2" unless exists $aliases{$r}; + push @nameList, $r; + $regBank->{$r} = $bank if $auto && defined $bank; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank; + } + } + } + elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$') + { + push @nameList, $1; + $regBank->{$1} = $2 if $auto && defined $2; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2; + } + else + { + die "Bad register name: '$fullName' at: $line\n"; + } + } + die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList; + die "Missmatched register mapping at: $line\n" if $share && @numList > 1; + + my $i = 0; + while ($i < $#numList-1) + { + last if $numList[$i] + 1 != $numList[$i+1]; + $i++; + } + my $ascending = $i+1 == $#numList; + + foreach my $n (0..$#nameList) + { + die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]}; + + if ($auto) + { + $regMap->{$nameList[$n]} = \@numList; + } + elsif ($share) + { + $regMap->{$nameList[$n]} = 'R' . $numList[0]; + } + else + { + $regMap->{$nameList[$n]} = 'R' . $numList[$n]; + if ($ascending && ($numList[$n] & 1) == 0) + { + my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3); + if ($end <= $#nameList) + { + $vectors->{$nameList[$n]} = [ @nameList[$n .. 
$end] ]; + if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}}) + { + $regMap->{$aliases{$nameList[$n]}} = $regMap->{$nameList[$n]}; + $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]}; + delete $aliases{$nameList[$n]}; + } + } + } + } + } + } +} + +sub preProcessLine +{ + $_[0] =~ s|^\s+||; + + my $val = shift; + + $val =~ s{(?:#|//).*}{}; + + return $val =~ m'\S'; +} + +sub countUniqueDescendants +{ + my ($node, $edges) = @_; + + + if (my $children = $node->{children}) + { + foreach my $child (grep $_->[1], @$children) # skip WaR deps and traversed edges + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + + $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges); + } + } + else + { + return $node->{lineNum}; + } + return ($node->{lineNum}, keys %{$node->{deps}}); +} +sub updateDepCounts +{ + my ($node, $edges) = @_; + + + if (my $children = $node->{children}) + { + foreach my $child (@$children) + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + updateDepCounts($child->[0], $edges); + } + } + $node->{deps} = ref $node->{deps} ? 
keys %{$node->{deps}} : $node->{deps}+0; +} + +sub registerHealth +{ + my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_; + + my (@banks, @conflicts); + + foreach my $slot (qw(r8 r20 r39)) + { + my $r = $capData->{$slot} or next; + next if $r eq 'RZ'; + + my $slotHist = $reuseHistory->{$slot} ||= {}; + + $reuseHistory->{total}++; + + if (exists $slotHist->{$r}) + { + $reuseHistory->{reuse}++; + } + else + { + my $bank = substr($r,1) & 7; + + if ($banks[$bank] && $banks[$bank] ne $r) + { + push @conflicts, $banks[$bank] if !@conflicts; + push @conflicts, $r; + + $reuseHistory->{conflicts}++; + } + $banks[$bank] = $r; + } + + if ($reuseFlags & $reuseSlots{$slot}) + { $slotHist->{$r} = 1; } + else + { delete $slotHist->{$r}; } + } + if ($inst && @conflicts && !$nowarn) + { + printf "CONFLICT at 0x%04x (%s): $inst\n", $instAddr, join(',', @conflicts); + } + return scalar @conflicts; +} + +1; + +__END__ + +=head1 NAME + +KeplerAs::KeplerAs - Assembler for NVIDIA Maxwell architecture + +=head1 SYNOPSIS + + KeplerAs.pl [opts] + +=head1 DESCRIPTION + +See the documentation at: https://github.com/NervanaSystems/KeplerAs + +=head1 SEE ALSO + +See the documentation at: https://github.com/NervanaSystems/KeplerAs + + +=head1 AUTHOR + +Scott Gray, E<lt>sgray@nervanasys.comE<gt> + +=head1 COPYRIGHT AND LICENSE + +The MIT License (MIT) + +Copyright (c) 2014 Scott Gray + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +=cut diff --git a/Assembler/KeplerAs/lib/KeplerAs/KeplerAsGrammar.pm b/Assembler/KeplerAs/lib/KeplerAs/KeplerAsGrammar.pm new file mode 100644 index 0000000..d372ea3 --- /dev/null +++ b/Assembler/KeplerAs/lib/KeplerAs/KeplerAsGrammar.pm @@ -0,0 +1,1659 @@ +package KeplerAs::KeplerAsGrammar; + +use strict; +use Carp; +use Exporter; +use Data::Dumper; +our @ISA = qw(Exporter); + +our @EXPORT = qw( + %grammar %flags + parseInstruct genCode genReuseCode + processAsmLine processSassLine processSassCtrlLine + replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters +); + +require 5.10.0; + +sub getI +{ + my ($orig, $pos, $mask) = @_; + my $val = $orig; + my $neg = $val =~ s|^\-||; + + if ($val =~ m'^(\d+)[xX]<([^>]+)>') + { + my $mul = $1; + my $exp = $2; + $exp =~ s/(?> $trunc) & 0x7ffff if $trunc; + } + return $val << $pos; +} +sub getR +{ + my ($val, $pos) = @_; + if ($val =~ m'^R(\d+|Z)$' && $1 < 255) + { + $val = $1 eq 'Z' ? 0xff : $1; + } + else + { + die "Bad register name found: $val\n"; + } + return $val << $pos; +} +sub getP +{ + my ($val, $pos) = @_; + if ($val =~ m'^P(\d|T)$' && $1 < 7) + { + $val = $1 eq 'T' ? 
7 : $1; + } + else + { + die "Bad predicate name found: $val\n"; + } + return $val << $pos; +} +sub getC { ((hex($_[0]) >> 2) & 0x3fff) << 23 } + +my %operands = +( + p0 => sub { getP($_[0], 2) }, + p3 => sub { getP($_[0], 5) }, + p12 => sub { getP($_[0], 14) }, + p29 => sub { getP($_[0], 32) }, + p39 => sub { getP($_[0], 42) }, + p45 => sub { getP($_[0], 48) }, + p48 => sub { getP($_[0], 51) }, + p58 => sub { getP($_[0], 58) }, + r0 => sub { getR($_[0], 2) }, + r8 => sub { getR($_[0], 10) }, + r20 => sub { getR($_[0], 23) }, + r28 => sub { getR($_[0], 28) }, + r39s20 => sub { getR($_[0], 42) }, + r39 => sub { getR($_[0], 42) }, + r39a => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to whipe it out, register must be in sequence with r20 + c20 => sub { getC($_[0]) }, + z20 => sub { getC($_[0]) }, + c39 => sub { getC($_[0]) }, + c34 => sub { hex($_[0]) << 37 }, + c36 => sub { hex($_[0]) << 39 }, + f20w32 => sub { getF($_[0], 23, 'f') }, + f20 => sub { getF($_[0], 23, 'f', 12) }, + d20 => sub { getF($_[0], 23, 'd', 44) }, + i8w4 => sub { getI($_[0], 10, 0xf) }, + i20 => sub { getI($_[0], 23, 0x7ffff) }, + i20w6 => sub { getI($_[0], 23, 0x3f) }, + i20w7 => sub { getI($_[0], 23, 0x7f) }, + i20w8 => sub { getI($_[0], 23, 0xff) }, + i20w12 => sub { getI($_[0], 23, 0xfff) }, + i20w24 => sub { getI($_[0], 23, 0xffffff) }, + i20w32 => sub { getI($_[0], 23, 0xffffffff) }, + i31w4 => sub { getI($_[0], 34, 0xf) }, + i34w13 => sub { getI($_[0], 37, 0x1fff) }, + i36w20 => sub { getI($_[0], 36, 0xfffff) }, + i39w8 => sub { getI($_[0], 42, 0x1f) }, + i28w8 => sub { getI($_[0], 28, 0xff) }, + i28w20 => sub { getI($_[0], 31, 0xfffff) }, + i48w8 => sub { getI($_[0], 48, 0xff) }, + i51w5 => sub { getI($_[0], 51, 0x1f) }, + i53w5 => sub { getI($_[0], 53, 0x1f) }, + i23w6 => sub { getI($_[0], 23, 0x3f) }, +); + +my $hex = qr"0[xX][0-9a-fA-F]+"; +my $iAddr = qr"\d+[xX]<[^>]+>"; +my $immed = qr"$hex|$iAddr|\d+"o; +my $reg = qr"[a-zA-Z_]\w*"; # must start with 
letter or underscore\ +my $p = qr"P[0-6T]"; +my $noPred = qr"(?)"; +my $pred = qr"\@(?\!)?P(?[0-6]) "; +my $p0 = qr"(?$p)"o; +my $p3 = qr"(?$p)"o; +my $p12 = qr"(?\!)?(?$p)"o; +my $p29 = qr"(?\!)?(?$p)"o; +my $p39 = qr"(?\!)?(?$p)"o; +my $p45 = qr"(?$p)"o; +my $p48 = qr"(?$p)"o; +my $p58 = qr"(?$p)"o; +my $r0 = qr"(?$reg)"; +my $r0cc = qr"(?$reg)(?\.CC)?"; +my $r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r28 = qr"(?$reg)"; +my $r39s20 = qr"(?\-)?(?\|)?(?(?$reg))\|?(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39 = qr"(?\-)?(?$reg)(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39a = qr"(?(?$reg))(?\.reuse)?"; +my $c20 = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20x = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20s39 = qr"(?\-)?c\[(?$hex)\]\s*\[(?$hex)\]"o; +my $f20w32 = qr"(?(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))"; +my $f20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $d20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $i8w4 = qr"(?$immed)"o; +my $i20 = qr"(?(?\-)?$immed)(?\.NEG)?"o; +my $i20w6 = qr"(?$immed)"o; +my $i20w7 = qr"(?$immed)"o; +my $i20w8 = qr"(?$immed)"o; +my $i20w12 = qr"(?$immed)"o; +my $i20w24 = qr"(?\-?$immed)"o; +my $i20w32 = qr"(?\-?$immed)"o; +my $i39w8 = qr"(?\-?$immed)"o; +my $i28w8 = qr"(?$immed)"o; +my $i28w20 = qr"(?\-?$immed)"o; +my $i31w4 = qr"(?$immed)"o; +my $i34w13 = qr"(?$immed)"o; +my $i36w20 = qr"(?$immed)"o; +my $i48w8 = qr"(?$immed)"o; +my $i51w5 = qr"(?$immed)"o; +my $i53w5 = qr"(?$immed)"o; +my $i23w6 = qr"(?$immed)"o; +my $ir20 = qr"$i20|$r20"o; +my $cr20 = qr"$c20|$r20"o; +my $icr20 = qr"$i20|$c20|$r20"o; +my $fcr20 = qr"$f20|$c20|$r20"o; +my $cr39 = qr"$c20s39|$r39"o; +my $dr20 = qr"$d20|$r20"o; + +my $u32 = qr"(?\.U32)?"; +my $REV2B = qr"(?\.REV2B)?"; +my $W = 
qr"(?\.W)?"; +my $pnot2d= qr"(?\.PNOT2D)?"; +my $ftz = qr"(?\.FTZ)?"; +my $sat = qr"(?\.SAT)?"; +my $rnd = qr"(?:\.(?RN|RM|RP|RZ))?"; +my $mulf = qr"(?:\.(?D2|D4|D8|M8|M4|M2))?"; +my $condition = qr"(?:(?F|LT|EQ|LE|GT|NE|GE|NUM|NAN|LTU|EQU|LEU|GTU|NEU|GEU|OFF|LO|SFF|LS|HI|SFT|HS|OFT))?"; +my $lane2a= qr"(?:\.(?LNONE|L0|L1|L01|L2|L02|L12|L012|L3|L03|L13|L013|L23|L023|L123))?"; +my $lane0e= qr"(?:\.(?LNONE|L0|L1|L01|L2|L02|L12|L012|L3|L03|L13|L013|L23|L023|L123))?"; + + +my $round = qr"(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?"; +my $fcmp = qr"(?\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; +my $icmp = qr"\.(?LT|EQ|LE|GT|NE|GE)"; +my $bool = qr"\.(?AND|OR|XOR|PASS_B)"; +my $bool2 = qr"\.(?AND|OR|XOR)"; +my $func = qr"\.(?COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)"; +my $rro = qr"\.(?SINCOS|EX2)"; +my $add3 = qr"(?:\.(?X|RS|LS))?"; +my $lopz = qr"(?:\.(?NZ|Z) $p48,|(?))"o; +my $X = qr"(?\.X)?"; +my $PO = qr"(?\.PO)?"; +my $bf = qr"(?\.BF)?"; +my $S = qr"(?\.S)?"; +my $tld = qr"(?NODEP\.)?(?:(?T)|(?P))"; +my $chnls = qr"(?R|RGBA)"; +my $sr = qr"SR_(?\S+)"; +my $shf = qr"(?\.W)?(?:\.(?U64|S64))?(?\.HI)?"; +my $imad = qr"(?:\.(?U32|S32))?(?:\.(?U32|S32))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $imadc = qr"(?:\.(?U32|S32))?(?:\.(?U32|S32))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $imul = qr"(?:\.(?U32|S32))?(?:\.(?U32|S32))?"; +my $vmad8 = qr"\.(?[SU])(?8|16)\.(?[SU])(?8|16)(?\.PO)?(?\.SHR_7)?(?\.SHR_15)?(?\.SAT)?"; +my $vmad16= qr"\.(?[SU])(?16)\.(?[SU])(?16)"; +my $hilo = qr"(?:\.(?XHI|XLO))?"; +my $hi = qr"(?:\.(?HI))?"; +my $vaddType = qr"(?:\.(?UD))?(?:\.(?SD))?(?:\.(?[SU])(?8|16|32))?(?:\.(?[SU])(?8|16|32))?"; +my $vaddMode = qr"(?:\.(?MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?"; +my $vmnmx = qr"(?:\.(?MX))?"; +my $x2x = qr"\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)"; +my $prmt = qr"(?:\.(?F4E|B4E|RC8|ECL|ECR|RC16))?"; +my $shfl = qr"\.(?IDX|UP|DOWN|BFLY)"; +my $bar = qr"\.(?SYNC|ARV|RED)(?:\.(?POPC|AND|OR))? 
(?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?()|(?))(?(), $p39|(?))"o; +my $b2r = qr"\.RESULT $r0(?:, $p45|(?))"o; +my $dbar = qr"(?SB0|SB1|SB2|SB3|SB4|SB5)"; +my $dbar2 = qr" {(?5)?,?(?4)?,?(?3)?,?(?2)?,?(?1)?,?(?0)?}"; +my $mbar = qr"\.(?CTA|GL|SYS)"; +my $addr = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i20w24)?\]"o; +my $addr2 = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i28w20)?\]"o; +my $ldc = qr"c\[(?$hex)\]\s*$addr"o; +my $atom = qr"(?\.E)?(?:\.(?ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; +my $vote = qr"\.(?ALL|ANY|EQ)"o; +my $memType = qr"(?\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; +my $memTypeX = qr"(?\.b32|\.b64|\.b96|\.b128)"; +my $memCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CI|CS|CV|IL|WT|LU))?"; +my $ldmemCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|LU|CV))?"; +my $stmemCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CS|WT))?"; + + + + +my $s2rT = {class => 's2r', lat => 2, blat => 25, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $smemT = {class => 'mem', lat => 2, blat => 30, rlat => 2, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $gmemT = {class => 'mem', lat => 2, blat => 200, rlat => 4, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $x32T = {class => 'x32', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 1}; +my $x64T = {class => 'x64', lat => 2, blat => 128, rlat => 0, rhold => 0, tput => 128, dual => 0, reuse => 1}; +my $shftT = {class => 'shift', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $cmpT = {class => 'cmp', lat => 13, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $qtrT = {class => 'qtr', lat => 8, blat => 0, rlat => 4, rhold => 0, tput => 1, dual => 1, reuse => 0}; +my $rroT = {class => 'rro', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $voteT = {class => 'vote', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; + + +our %grammar = +( + FADD => [ + { type 
=> $x32T, code => 0xe2c0000000000002, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc2c0000000000001, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $f20;"o, }, + ], + FADD32I => [ { type => $x32T, code => 0x4000000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o, } ], + FCHK => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o, } ], #Partial? + FCMP => [ + { type => $cmpT, code => 0xdd00000000000002, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $cr20, $r39;"o, }, + { type => $cmpT, code => 0xdd00000000000002, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $r39s20, $c20s39;"o, }, + { type => $cmpT, code => 0xb500000000000001, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $f20, $r39;"o, }, + ], + FFMA => [ + { type => $x32T, code => 0xcc00000000000002, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $cr20, $r39;"o, }, + { type => $x32T, code => 0xcc00000000000002, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0x9400000000000001, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $f20, $r39;"o, }, + ], + FMNMX => [ + { type => $shftT, code => 0xe300000000000002, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $cr20, $p39;"o, }, + { type => $shftT, code => 0xc300000000000001, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $f20, $p39;"o, }, + ], + FMUL => [ + { type => $x32T, code => 0xe340000000000002, rule => qr"^$pred?FMUL$ftz$rnd$sat$mulf $r0, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc340000000000001, rule => qr"^$pred?FMUL$ftz$rnd$sat$mulf $r0, $r8, $f20;"o, }, + ], + FMUL32I => [ { type => $x32T, code => 0x2000000000000002, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o, } ], + FSET => [ + { type => $shftT, code => 0xc000000000000002, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $cr20, $p39;"o, }, + { type => $shftT, code => 0x8000000000000001, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $f20, $p39;"o, }, + ], + FSETP => [ { type => $cmpT, 
code => 0xdd80000000000002, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], + MUFU => [ { type => $qtrT, code => 0x8400000000000002, rule => qr"^$pred?MUFU$func $r0, $r8;"o, } ], + RRO => [ { type => $rroT, code => 0xe480000000000002, rule => qr"^$pred?RRO$rro $r0, $r20;"o, } ], + DADD => [ + { type => $x64T, code => 0xe380000000000002, rule => qr"^$pred?DADD$rnd $r0, $r8, $cr20;"o, }, + { type => $x64T, code => 0xc380000000000001, rule => qr"^$pred?DADD$rnd $r0, $r8, $d20;"o, }, + ], + DFMA => [ + { type => $x64T, code => 0xdb80000000000002, rule => qr"^$pred?DFMA$rnd $r0, $r8, $cr20, $r39;"o, }, + { type => $x64T, code => 0xdb80000000000002, rule => qr"^$pred?DFMA$rnd $r0, $r8, $d20, $r39;"o, }, + ], + DMNMX => [ + { type => $cmpT, code => 0xe280000000000002, rule => qr"^$pred?DMNMX $r0, $r8, $cr20, $p39;"o, }, + { type => $cmpT, code => 0xe280000000000002, rule => qr"^$pred?DMNMX $r0, $r8, $d20, $p39;"o, }, + ], + DMUL => [ + { type => $x64T, code => 0xe400000000000002, rule => qr"^$pred?DMUL$rnd $r0, $r8, $cr20;"o, }, + { type => $x64T, code => 0xc400000000000001, rule => qr"^$pred?DMUL$rnd $r0, $r8, $d20;"o, }, + ], + DSET => [ { type => $cmpT, code => 0xc800000000000002, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o, } ], + DSETP => [ { type => $cmpT, code => 0xdc00000000000002, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o, } ], + FSWZADD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o, } ], #TODO + + HADD2 => [ { type => $x32T, code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o, } ], + HMUL2 => [ { type => $x32T, code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o, } ], + HFMA2 => [ { type => $x32T, code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o, } ], + HSETP2 => [ { type => $cmpT, code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial 
+ + BFE => [ + { type => $shftT, code => 0xe008000000000002, rule => qr"^$pred?BFE$u32$REV2B $r0, $r8, $cr20;"o, }, + { type => $shftT, code => 0xc008000000000001, rule => qr"^$pred?BFE$u32$REV2B $r0, $r8, $ir20;"o, }, + ], + BFI => [ + { type => $shftT, code => 0xdf80000000000002, rule => qr"^$pred?BFI$S $r0, $r8, $r20, $cr39;"o, }, + { type => $shftT, code => 0xb780000000000001, rule => qr"^$pred?BFI$S $r0, $r8, $i20, $cr39;"o, }, + ], + FLO => [ { type => $s2rT, code => 0xe180000000000002, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o, } ], + IADD => [ + { type => $x32T, code => 0xe080000000000002, rule => qr"^$pred?IADD$S$PO$sat$X $r0cc, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc080000000000001, rule => qr"^$pred?IADD$S$PO$sat$X $r0cc, $r8, $i20;"o, }, + ], + + ISUB => [ + { type => $x32T, code => 0xe088000000000002, rule => qr"^$pred?ISUB$sat$X $r0cc, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc088000000000001, rule => qr"^$pred?ISUB$sat$X $r0cc, $r8, $i20;"o, }, + { type => $x32T, code => 0xc090000000000001, rule => qr"^$pred?ISUB$sat$X $r0cc, $i20, $r8;"o, }, + ], + + + + IADD32I => [ { type => $x32T, code => 0x4000000000000001, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o, } ], + ICMP => [ + { type => $cmpT, code => 0xda08000000000002, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $cr20, $r39;"o, }, + { type => $cmpT, code => 0xda08000000000002, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $r39s20, $c20s39;"o, }, + { type => $cmpT, code => 0xb208000000000001, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $i20, $r39;"o, }, + ], + IMNMX => [ + { type => $shftT, code => 0xe108000000000002, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $cr20, $p39;"o, }, + { type => $shftT, code => 0xc108000000000001, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $i20, $p39;"o, }, + ], + ISET => [ + { type => $shftT, code => 0xda88000000000002, rule => qr"^$pred?ISET$bf$icmp$u32$X$bool$S $r0, $r8, $cr20, $p39;"o, }, + { type => $shftT, code => 0xb288000000000001, rule => 
qr"^$pred?ISET$bf$icmp$u32$X$bool$S $r0, $r8, $i20, $p39;"o, }, + ], + ISETP => [ + { type => $cmpT, code => 0xdb08000000000002, rule => qr"^$pred?ISETP$icmp$u32$X$bool$S $p3, $p0, $r8, $cr20, $p39;"o, }, + { type => $cmpT, code => 0xb308000000000001, rule => qr"^$pred?ISETP$icmp$u32$X$bool$S $p3, $p0, $r8, $i20, $p39;"o, }, + ], + ISCADD => [ + { type => $shftT, code => 0xe0c0000000000002, rule => qr"^$pred?ISCADD$X $r0cc, $r8, $cr20, $i39w8;"o, }, + { type => $shftT, code => 0xc0c0000000000001, rule => qr"^$pred?ISCADD$X $r0cc, $r8, $i20, $i39w8;"o, } + ], + ISCADD32I => [ { type => $shftT, code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o, } ], + + LOP => [ + { type => $x32T, code => 0xe200000000000002, rule => qr"^$pred?LOP$bool$S $r0, (?~)?$r8, (?~)?$cr20(?\.INV)?;"o, }, + { type => $x32T, code => 0xc200000000000001, rule => qr"^$pred?LOP$bool$S $r0, (?~)?$r8, (?~)?$i20(?\.INV)?;"o, }, + ], + LOP32I => [ { type => $x32T, code => 0x2000000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o, } ], + LOP3 => [ + { type => $x32T, code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o, }, + { type => $x32T, code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o, }, + ], + POPC => [ + { type => $s2rT, code => 0xe040000000000002, rule => qr"^$pred?POPC $r0, $r8, $cr20;"o, }, + { type => $s2rT, code => 0xc040000000000001, rule => qr"^$pred?POPC $r0, $r8, $i20;"o, }, + ], + SHF => [ + { type => $shftT, code => 0xdfc0000000000002, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0xb7c0000000000001, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $i20, $r39;"o, }, + { type => $shftT, code => 0xe7c0000000000002, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0xc7c0000000000001, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $i20, $r39;"o, }, + ], + SHL => [ + { type => $shftT, code => 0xe240000000000002, 
rule => qr"^$pred?SHL(?\.W)? $r0, $r8, $cr20;"o, }, + { type => $shftT, code => 0xc240000000000001, rule => qr"^$pred?SHL(?\.W)? $r0, $r8, $i23w6;"o, }, + ], + SHR => [ + { type => $shftT, code => 0xe148000000000002, rule => qr"^$pred?SHR$u32$W $r0, $r8, $cr20;"o, }, + { type => $shftT, code => 0xc148000000000001, rule => qr"^$pred?SHR$u32$W $r0, $r8, $i23w6;"o, }, + ], +IMAD => [ + { type => $x32T, code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $r20, $r39;"o, }, + { type => $x32T, code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $c20x, $r39;"o, }, + { type => $x32T, code => 0xa108000000000001, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $i20, $r39;"o, }, + ], + IMADSP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO + IMUL => [ + { type => $x32T, code => 0xe1c0180000000002, rule => qr"^$pred?IMUL$imul$hi $r0, $r8, $cr20;"o, }, + { type => $x32T, code => 0xc1c0180000000001, rule => qr"^$pred?IMUL$imul$hi $r0, $r8, $i20;"o, }, + ], + IMUL32I => [ + { type => $x32T, code => 0x2e00000000000002, rule => qr"^$pred?IMUL32I$imul$hi $r0, $r8, $i20w32;"o, }, + ], + + F2F => [ { type => $qtrT, code => 0xe540000000000002, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ], + F2I => [ { type => $qtrT, code => 0xe580000000000002, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o, } ], + I2F => [ { type => $qtrT, code => 0xe5c0000000000002, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o, } ], + I2I => [ { type => $qtrT, code => 0xe600000000000002, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o, } ], + F2ITRUNC => [ { type => $qtrT, code => 0xe5800c00051ca846, rule => qr"^$pred?F2ITRUNC[^;]*;"o, } ], + + MOV => [ { type => $x32T, code => 0xe4c03c0000000002, rule => qr"^$pred?MOV$lane2a$S $r0, $cr20;"o, } ], + MOV32I => [ { type => $x32T, code => 
0x740000000003c002, rule => qr"^$pred?MOV32I$lane0e$S $r0, (?:$i20w32|$f20w32);"o, } ], + PRMT => [ + { type => $x32T, code => 0xde00000000000002, rule => qr"^$pred?PRMT$prmt $r0, $r8, $cr20, $cr39;"o, }, + { type => $x32T, code => 0xb600000000000001, rule => qr"^$pred?PRMT$prmt $r0, $r8, $i20, $r39;"o, }, + ], + SEL => [ + { type => $x32T, code => 0xe500000000000002, rule => qr"^$pred?SEL $r0, $r8, $cr20, $p39;"o, }, + { type => $x32T, code => 0xc500000000000001, rule => qr"^$pred?SEL $r0, $r8, $i20, $p39;"o, }, + ], + SHFL => [ { type => $smemT, code => 0x7880000000000002, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ], + + PSET => [ { type => $cmpT, code => 0x8440000000000002, rule => qr"^$pred?PSET$bf$bool2$bool $r0, $p12, $p29, $p39;"o, } ], + PSETP => [ { type => $cmpT, code => 0x8480000000000002, rule => qr"^$pred?PSETP$bool2$bool$S $p3, $p0, $p12, $p29, $p39;"o, } ], + CSET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o, } ], #TODO + CSETP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO + P2R => [ { type => $x32T, code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o, } ], + R2P => [ { type => $cmpT, code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o, } ], + + TLD => [ { type => $gmemT, code => 0x700a00067f9ffc02, rule => qr"^$pred?TLD[^;]*;"o, } ], #Partial + TLDzxx => [ { type => $gmemT, code => 0x700a00057f9ffc02, rule => qr"^$pred?TLDzxx[^;]*;"o, } ], #Partial + TEXDEPBAR => [ { type => $gmemT, code => 0x77000000001c0002, rule => qr"^$pred?TEXDEPBAR $i20w6;"o, } ], #Partial + TEX => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o, } ], #TODO + TLD4 => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o, } ], #TODO + TXQ => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o, } ], #TODO + + LD => [ { type => $gmemT, 
code => 0xc000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr;"o, } ], + LDY => [ { type => $gmemT, code => 0x7f80000000000002, rule => qr"^$pred?LDY $r0, $i20;"o, } ], + LDX => [ { type => $gmemT, code => 0x7ec0000000000002, rule => qr"^$pred?LDX$memTypeX $r0, $addr;"o, } ], + ST => [ { type => $gmemT, code => 0xe000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0;"o, } ], + LDG => [ + { type => $gmemT, code => 0x600010047f800001, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o, }, + ], + LDS => [ { type => $smemT, code => 0x7a40000000000002, rule => qr"^$pred?LDS$memCache$memType$S $r0, $addr;"o, } ], + STS => [ { type => $smemT, code => 0x7ac0000000000002, rule => qr"^$pred?STS$memCache$memType$S $addr, $r0;"o, } ], + LDL => [ { type => $gmemT, code => 0x7a00000000000002, rule => qr"^$pred?LDL$ldmemCache$memType$S $r0, $addr;"o, } ], + STL => [ { type => $gmemT, code => 0x7a80000000000002, rule => qr"^$pred?STL$stmemCache$memType$S $addr, $r0;"o, } ], + LDC => [ { type => $gmemT, code => 0x7c800000000ffc02, rule => qr"^$pred?LDC$memCache$memType$S $r0, $ldc;"o, } ], + ATOM => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + RED => [ { type => $gmemT, code => 0x68000000000003fe, rule => qr"^$pred?RED$atom $addr2, $r20;"o, } ], + CCTL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o, } ], #TODO + CCTLL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO + + SULD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o, } ], #TODO + + BRA => [ + { type => $x32T, code => 0x120000000000003c, rule => qr"^$pred?BRA(?\.U)? $i20w24;"o, }, + { type => $x32T, code => 0x1200000000000000, rule => qr"^$pred?BRA(?\.U)? 
CC\.$condition, $i20w24;"o, }, + ], + + BRX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o, } ], #TODO + JMP => [ + { type => $x32T, code => 0x108000000000003c, rule => qr"^$pred?JMP(?\.U)? $i20w32;"o, }, + { type => $x32T, code => 0x1080000000000000, rule => qr"^$pred?JMP(?\.U)? CC\.$condition, $i20w32;"o, }, + ], + JMX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o, } ], #TODO + SSY => [ { type => $x32T, code => 0x1480000000000000, rule => qr"^$noPred?SSY $i20w24;"o, } ], + + CAL => [ { type => $x32T, code => 0x1300000000000100, rule => qr"^$noPred?CAL $i20w24;"o, } ], + JCAL => [ { type => $x32T, code => 0x1100000000000100, rule => qr"^$noPred?JCAL $i20w32;"o, } ], + PRET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o, } ], #TODO + RET => [ + { type => $x32T, code => 0x190000000000003c, rule => qr"^$pred?RET;"o, }, + { type => $x32T, code => 0x1900000000000000, rule => qr"^$pred?RET CC\.$condition;"o, }, + ], + BRK => [ { type => $x32T, code => 0x1a0000000000003c, rule => qr"^$pred?BRK;"o, } ], + PBK => [ { type => $x32T, code => 0x1500000000000000, rule => qr"^$noPred?PBK $i20w24;"o, } ], + CONT => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o, } ], + PCNT => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o, } ], + EXIT => [ + { type => $x32T, code => 0x18000000001c003c, rule => qr"^$pred?EXIT;"o, }, + { type => $x32T, code => 0x18000000001c0000, rule => qr"^$pred?EXIT CC\.$condition;"o, }, + ], + BPT => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o, } ], + + NOP => [ { type => $x32T, code => 0x8580000000003c02, rule => qr"^$pred?NOP$S;"o, } ], + S2R => [ { type => $s2rT, code => 0x8640000000000002, rule => qr"^$pred?S2R$S $r0, $sr;"o, } ], + B2R => [ { type => $x32T, code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o, } ], + BAR => [ + { type => $gmemT, 
code => 0x8540dc0000000002, rule => qr"^$pred?BAR.SYNC $i8w4;"o, }, + { type => $gmemT, code => 0x8540dc0000000002, rule => qr"^$pred?BAR.SYNC $i8w4, $i20w12;"o, }, + { type => $gmemT, code => 0x85409c0000000002, rule => qr"^$pred?BAR.SYNC $i8w4, $r20;"o, }, + { type => $gmemT, code => 0x85405c0000000002, rule => qr"^$pred?BAR.SYNC $r8;"o, }, + { type => $gmemT, code => 0x85405c0000000002, rule => qr"^$pred?BAR.SYNC $r8, $i20w12;"o, }, + { type => $gmemT, code => 0x85401c0000000002, rule => qr"^$pred?BAR.SYNC $r8, $r20;"o, }, + { type => $gmemT, code => 0x8540dc0800000002, rule => qr"^$pred?BAR.ARV $i8w4, $i20w12;"o, }, + { type => $gmemT, code => 0x85409c0800000002, rule => qr"^$pred?BAR.ARV $i8w4, $r20;"o, }, + { type => $gmemT, code => 0x85405c0800000002, rule => qr"^$pred?BAR.ARV $r8, $i20w12;"o, }, + { type => $gmemT, code => 0x85401c0800000002, rule => qr"^$pred?BAR.ARV $r8, $r20;"o, }, + ], + DEPBAR => [ + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, }, + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o, }, + ], + MEMBAR => [ { type => $x32T, code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o, } ], + + VOTE => [ + { type => $voteT, code => 0x86c0000000000002, rule => qr"^$pred?VOTE$vote (?:$r0, |(?))$p45, $p39;"o, } ], + + + VADD => [ { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMAD => [ + { type => $x32T, code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $i20, $r39;"o, }, + { type => $x32T, code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, }, + ], + VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 
0x2044000000000000 + VMNMX => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + + VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 +); + +my @flags = grep /\S/, split "\n", q{; + +BFE, BFI, FLO, IADD, ISUB, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD +0x0800000000000000 neg + +FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP +0x0800000000000000 neg + +PSET, PSETP +0x0000000000020000 p12not +0x0000000800000000 p29not + +FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE +0x0000200000000000 p39not + +IADD32I +0x0010000000000000 CC + +IMAD, PSET, FSET, DSET, ISET, IADD, ISUB, IMUL, ISCADD +0x0004000000000000 CC + +IMAD: mode +0x0200000000000000 HI + +IMAD +0x0010000000000000 X + +IMUL: mode +0x0000040000000000 HI + +IMUL32I: mode +0x0100000000000000 HI + +FFMA, FADD, FCMP, FMUL, FMNMX, FSWZ, FSET, FSETP, FCHK, RRO, MUFU, DFMA, DADD, DMUL, DMNMX, DSET, DSETP, IMAD, IMADSP, IMUL, IADD, ISCADD, ISAD, IMNMX, BFE, BFI, SHR, SHL, SHF, LOP, FLO, ISET, ISETP, ICMP, POPC, F2F, F2I, I2F, I2I, MOV, MOV32I, SEL, PRMT, SHFL, P2R, R2P, CSET, CSETP, PSET, PSETP, TEX, TLD, TLD4, TXQ, LDC, LD, LDG, LDL, LDS, LDSLK, ST, STL, STS, STSCUL, ATOM, RED, CCTL, CCTLL, MEMBAR, SUCLAMP, SUBFM, SUEAU, SULDGA, SUSTGA, BRA, BRX, RET, BRK, CONT, NOP, S2R, B2R, BAR, VOTE, MOV +0x0000000000400000 S + +SHF +0x0020000000000000 W +0x0001000000000000 HI + +SHF: type +0x0000020000000000 U64 +0x0000010000000000 S64 + +IMAD, ICMP, ISET, ISETP, ISAD, SHR, IMNMX, FLO, BFE +0x0008000000000000 U32 + +SHR, SHL +0x0000040000000000 W + +SHFL +0x0000000080000000 i20w8 +0x0000000100000000 i34w13 + +SHFL: mode +0x0000000000000000 IDX +0x0000000200000000 UP +0x0000000300000000 DOWN 
+0x0000000600000000 BFLY + +IMNMX: mode +0x0000080000000000 XLO +0x0000180000000000 XHI + +ISETP, ISET, ICMP: cmp +0x0010000000000000 LT +0x0020000000000000 EQ +0x0030000000000000 LE +0x0040000000000000 GT +0x0050000000000000 NE +0x0060000000000000 GE + +ISETP, ISET, PSETP, PSET, FSET, FSETP, DSET, DSETP: bool +0x0000000000000000 AND +0x0001000000000000 OR +0x0002000000000000 XOR + +PSETP, PSET: bool2 +0x0000000000000000 AND +0x0000000008000000 OR +0x0000000010000000 XOR + +ISETP, ISET, IADD, ISUB +0x0000400000000000 X + +ISCADD +0x0020000000000000 X + +ISET, PSET +0x0000800000000000 BF + +LOP: bool +0x0000000000000000 AND +0x0000100000000000 OR +0x0000200000000000 XOR +0x0000300000000000 PASS_B + +LOP, POPC, FLO +0x0000080000000000 INV + +LOP, POPC, IADD, ISUB +0x0000040000000000 INV1 + +LOP: z +0x0000200000000000 Z +0x0000300000000000 NZ + +LOP +0x0000000000000000 noz + +LOP32I: bool +0x0000000000000000 AND +0x0020000000000000 OR +0x0040000000000000 XOR + +PRMT: mode +0x0008000000000000 F4E +0x0010000000000000 B4E +0x0018000000000000 RC8 +0x0020000000000000 ECL +0x0028000000000000 ECR +0x0030000000000000 RC16 + +IMAD: type1 +0x0008000000000000 U32 +0x0008000000000000 S32 + +IMAD: type2 +0x0100000000000000 U32 +0x0100000000000000 S32 + +IMUL: type1 +0x0000080000000000 U32 +0x0000000000000000 S32 + +IMUL: type2 +0x0000100000000000 U32 +0x0000000000000000 S32 + +IMUL32I: type1 +0x0200000000000000 U32 +0x0000000000000000 S32 + +IMUL32I: type2 +0x0400000000000000 U32 +0x0000000000000000 S32 + +XMAD: type1 +0x0000000000000000 U16 +0x0001000000000000 S16 + +XMAD: type2 +0x0000000000000000 U16 +0x0002000000000000 S16 + +XMAD: mode +0x0000002000000000 MRG +0x0000001000000000 PSL +0x0008000000000000 CHI +0x0004000000000000 CLO +0x000c000000000000 CSFU + +XMAD: modec +0x0004000000000000 CLO +0x0008000000000000 CHI +0x000c000000000000 CSFU +0x0040000000000000 X +0x0080000000000000 PSL +0x0100000000000000 MRG + +XMAD +0x0010000000000000 CBCC + +XMAD: r8part 
+0x0000000000000000 H0 +0x0020000000000000 H1 + +XMAD: r20part +0x0000000000000000 H0 +0x0000000800000000 H1 + +XMAD: r20partx +0x0000000000000000 H0 +0x0010000000000000 H1 + +XMAD: r39part +0x0000000000000000 H0 +0x0010000000000000 H1 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part +0x0000000000000000 B0 +0x0000001000000000 B1 +0x0000002000000000 B2 +0x0000003000000000 B3 +0x0000001000000000 H1 +0x0000000000000000 H0 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part +0x0000000000000000 B0 +0x0000000010000000 B1 +0x0000000020000000 B2 +0x0000000030000000 B3 +0x0000000010000000 H1 +0x0000000000000000 H0 + +VMAD +0x0040000000000000 r8neg +0x0020000000000000 r39neg +0x0008000000000000 SHR_7 +0x0010000000000000 SHR_15 +0x0060000000000000 PO +0x0080000000000000 SAT + +VMNMX +0x0100000000000000 MX + +VADD, VABSDIFF, VMNMX +0x0080000000000000 SAT +0x0040000000000000 UD +0x0040000000000000 SD + +VSET: cmp +0x0040000000000000 LT +0x0080000000000000 EQ +0x00c0000000000000 LE +0x0100000000000000 GT +0x0140000000000000 NE +0x0180000000000000 GE + +VADD, VSET: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VABSDIFF: mode +0x0003000000000000 ACC +0x000b000000000000 MIN +0x0013000000000000 MAX +0x0023000000000000 MRG_16H +0x002b000000000000 MRG_16L +0x0033000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x003b000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMNMX: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1 +0x0000000000000000 U +0x0004000000000000 S + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2 +0x0000000000000000 U +0x0008000000000000 S + +VMAD, 
VADD, VABSDIFF, VMNMX, VSET: size1 +0x0000000000000000 8 +0x0000004000000000 16 +0x0000006000000000 32 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size2 +0x0000000000000000 8 +0x0000000000000000 16 +0x0000000000000000 32 + +IADD3: type +0x0001000000000000 X +0x0000002000000000 RS +0x0000004000000000 LS + +IADD3: r8part +0x0000000000000000 H0 +0x0000001000000000 H1 + +IADD3: r20part +0x0000000080000000 H0 + +IADD3: r39part +0x0000000200000000 H0 + +IADD3 +0x0008000000000000 r8neg +0x0004000000000000 r20neg +0x0002000000000000 r39neg + +IADD, ISUB, ISCADD +0x0010000000000000 r8neg +0x0008000000000000 r20neg +0x0018000000000000 PO + +IADD32I +0x0100000000000000 X +0x0800000000000000 r8neg + +IMAD +0x0080000000000000 r8neg + +IMAD +0x0040000000000000 r39neg + +DEPBAR: SB +0x0000000000000000 SB0 +0x0000000004000000 SB1 +0x0000000008000000 SB2 +0x000000000c000000 SB3 +0x0000000010000000 SB4 +0x0000000014000000 SB5 + +DEPBAR: cmp +0x0000000020000000 LE + +DEPBAR +0x0000000000000001 db0 +0x0000000000000002 db1 +0x0000000000000004 db2 +0x0000000000000008 db3 +0x0000000000000010 db4 +0x0000000000000020 db5 + +F2F, F2I, I2F, I2I: destWidth +0x0000000000000000 8 +0x0000000000000400 16 +0x0000000000000800 32 +0x0000000000000c00 64 + +F2F, F2I, I2F, I2I: srcWidth +0x0000000000000000 8 +0x0000000000001000 16 +0x0000000000002000 32 +0x0000000000003000 64 + +F2F, F2I, I2F, I2I: destSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000008000 S + +F2F, F2I, I2F, I2I: srcSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000008000 S + +F2I, I2F, I2I: r20part +0x0000000000000000 H0 +0x0000040000000000 H1 +0x0000000000000000 B0 +0x0000020000000000 B1 +0x0000040000000000 B2 +0x0000060000000000 B3 + +F2F: r20part +0x0000000000000000 H0 +0x0000020000000000 H1 + +F2F: round +0x0000040000000000 ROUND +0x0000048000000000 FLOOR +0x0000050000000000 CEIL +0x0000058000000000 TRUNC + +F2I: round +0x0000000000000000 ROUND +0x0000040000000000 FLOOR +0x0000080000000000 CEIL 
+0x00000c0000000000 TRUNC + +HADD2, HMUL2: r8part +0x0001000000000000 H0_H0 +0x0000000000000000 H1_H1 + +HFMA2: r20part +0x0000000020000000 H0_H0 +0x0000000030000000 H1_H1 + +FADD, DADD, FMUL, DMUL, F2F, I2F: rnd +0x0000000000000000 RN +0x0000040000000000 RM +0x0000080000000000 RP +0x00000c0000000000 RZ + +FMUL: mulf +0x0000100000000000 D2 +0x0000200000000000 D4 +0x0000300000000000 D8 +0x0000400000000000 M8 +0x0000500000000000 M4 +0x0000600000000000 M2 + +BRA, JMP, RET, EXIT: CON +0x0000000000000000 F +0x0000000000000004 LT +0x0000000000000008 EQ +0x000000000000000c LE +0x0000000000000010 GT +0x0000000000000014 NE +0x0000000000000018 GE +0x000000000000001c NUM +0x0000000000000020 NAN +0x0000000000000024 LTU +0x0000000000000028 EQU +0x000000000000002c LEU +0x0000000000000030 GTU +0x0000000000000034 NEU +0x0000000000000038 GEU +0x0000000000000040 OFF +0x0000000000000044 LO +0x0000000000000048 SFF +0x000000000000004c LS +0x0000000000000050 HI +0x0000000000000054 SFT +0x0000000000000058 HS +0x000000000000005c OFT + +MOV: lane2a +0x0000380000000000 LNONE +0x0000340000000000 L0 +0x0000300000000000 L1 +0x00002c0000000000 L01 +0x0000280000000000 L2 +0x0000240000000000 L02 +0x0000200000000000 L12 +0x00001c0000000000 L3 +0x0000180000000000 L03 +0x0000140000000000 L13 +0x0000100000000000 L013 +0x00000c0000000000 L23 +0x0000080000000000 L023 +0x0000040000000000 L123 + +MOV32I: lane0e +0x0000000000038000 LNONE +0x0000000000034000 L0 +0x0000000000030000 L1 +0x000000000002c000 L01 +0x0000000000028000 L2 +0x0000000000024000 L02 +0x0000000000020000 L12 +0x000000000001c000 L3 +0x0000000000018000 L03 +0x0000000000014000 L13 +0x0000000000010000 L013 +0x000000000000c000 L23 +0x0000000000008000 L023 +0x0000000000004000 L123 + +DFMA: rnd +0x0000000000000000 RN +0x0004000000000000 RM +0x0008000000000000 RP +0x000c000000000000 RZ + +FFMA: rnd +0x0000000000000000 RN +0x0040000000000000 RM +0x0080000000000000 RP +0x00c0000000000000 RZ + +FFMA, FMUL32I +0x0100000000000000 FTZ + +F2F, F2I, 
FADD, FMUL, FMNMX +0x0000800000000000 FTZ + +FADD32I +0x0080000000000000 FTZ + +FMUL32I +0x0020000000000000 FTZ + +FSET, FSETP, FCMP, DSET, DSETP +0x0400000000000000 FTZ + +HADD2, HMUL2 +0x0000008000000000 FTZ + +HFMA2 +0x0000002000000000 FTZ + +FADD, FFMA, FMUL, F2F, I2I, MUFU, IMAD, IADD, ISUB +0x0020000000000000 SAT + +FADD, DADD, FMNMX, DMNMX, MUFU, FFMA, DFMA, FMUL, DADD, DMUL +0x0008000000000000 r8neg + +FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I +0x0001000000000000 r20neg + +FMUL, DMUL, FFMA, DFMA +0x0001000000000000 r20neg + +FFMA, DFMA +0x0010000000000000 r39neg + +FADD, DADD, FMNMX, DMNMX, MUFU +0x0002000000000000 r8abs + +FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I +0x0010000000000000 r20abs + +FSETP, DSETP, FSET, DSET +0x0000400000000000 r8neg +0x0100000000000000 r20neg +0x0200000000000000 r8abs +0x0000800000000000 r20abs + +RRO: func +0x0000000000000000 SINCOS +0x0000040000000000 EX2 + +MUFU: func +0x0000000000000000 COS +0x0000000000800000 SIN +0x0000000001000000 EX2 +0x0000000001800000 LG2 +0x0000000002000000 RCP +0x0000000002800000 RSQ +0x0000000003000000 RCP64H +0x0000000003800000 RSQ64H + +FSETP, DSETP, FSET, DSET, FCMP: cmp +0x0008000000000000 .LT +0x0010000000000000 .EQ +0x0018000000000000 .LE +0x0020000000000000 .GT +0x0020000000000000 +0x0028000000000000 .NE +0x0030000000000000 .GE +0x0038000000000000 .NUM +0x0040000000000000 .NAN +0x0048000000000000 .LTU +0x0050000000000000 .EQU +0x0058000000000000 .LEU +0x0060000000000000 .GTU +0x0068000000000000 .NEU +0x0070000000000000 .GEU + +FSETP, DSETP, FSET, DSET: bool +0x0000000000000000 AND +0x0001000000000000 OR +0x0002000000000000 XOR + +HSETP2: cmp +0x0000002800000000 .NE + +HSETP2: bool +0x0000000000000000 AND + +S2R: sr +0x0000000000000000 LANEID +0x0000000001000000 VIRTCFG +0x0000000001800000 VIRTID +0x0000000002000000 PM0 +0x0000000002800000 PM1 +0x0000000003000000 PM2 +0x0000000003800000 PM3 +0x0000000004000000 PM4 +0x0000000004800000 PM5 +0x0000000005000000 PM6 +0x0000000005800000 
PM7 +0x0000000008000000 PRIM_TYPE +0x0000000008800000 INVOCATION_ID +0x0000000009000000 Y_DIRECTION +0x0000000010000000 TID +0x0000000010800000 TID.X +0x0000000011000000 TID.Y +0x0000000011800000 TID.Z +0x0000000012000000 CTA_PARAM +0x0000000012800000 CTAID.X +0x0000000013000000 CTAID.Y +0x0000000013800000 CTAID.Z +0x0000000014000000 NTID +0x0000000014800000 CirQueueIncrMinusOne +0x0000000015000000 NLATC +0x0000000015800000 43 +0x0000000016000000 44 +0x0000000016800000 45 +0x0000000017000000 46 +0x0000000017800000 47 +0x0000000018000000 SWINLO +0x0000000018800000 SWINSZ +0x0000000019000000 SMEMSZ +0x0000000019800000 SMEMBANKS +0x000000001a000000 LWINLO +0x000000001a800000 LWINSZ +0x000000001b000000 LMEMLOSZ +0x000000001b800000 LMEMHIOFF +0x000000001c000000 EQMASK +0x000000001c800000 LTMASK +0x000000001d000000 LEMASK +0x000000001d800000 GTMASK +0x000000001e000000 GEMASK +0x0000000020000000 GLOBALERRORSTATUS +0x0000000021000000 WARPERRORSTATUS +0x0000000028000000 CLOCKLO +0x0000000029000000 GLOBALTIMERLO +0x0000000029800000 GLOBALTIMERHI + +CS2R: sr +0x0000000005000000 CLOCKLO +0x0000000005100000 CLOCKHI +0x0000000005200000 GLOBALTIMERLO +0x0000000005300000 GLOBALTIMERHI + +B2R +0x0000e00000000000 nop45 + +BAR: red +0x0000000000000000 POPC +0x0000000800000000 AND +0x0000001000000000 OR + +MEMBAR: mode +0x0000000000000000 CTA +0x0000000000000100 GL +0x0000000000000200 SYS + +VOTE: mode +0x0000000000000000 ALL +0x0008000000000000 ANY +0x0010000000000000 EQ + +VOTE +0x00000000000003fc nor0 + +BRA +0x0000000000000200 U + +TLDS: chnls +0x0010000000000000 RGBA + +TLDS +0x0002000000000000 NODEP + +LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS +0x0000000000000000 nor8 + +LD, ST: type +0x0000000000000000 .U8 +0x0100000000000000 .S8 +0x0200000000000000 .U16 +0x0300000000000000 .S16 +0x0400000000000000 +0x0400000000000000 .32 +0x0500000000000000 .64 +0x0600000000000000 .128 + +LDX: type +0x0000000000000000 .b32 +0x0004000000000000 .b64 +0x0008000000000000 .b96 
+0x000c000000000000 .b128 + +LD, ST: cache +0x0000000000000000 CG +0x1000000000000000 CS +0x1800000000000000 CV +0x1800000000000000 WT + +STG, LDS, STS, LDL, STL, LDC: type +0x0000000000000000 .U8 +0x0008000000000000 .S8 +0x0010000000000000 .U16 +0x0018000000000000 .S16 +0x0020000000000000 +0x0020000000000000 .32 +0x0028000000000000 .64 +0x0030000000000000 .128 + +LDG: type +0x0000000000000000 .U8 +0x0000800000000000 .S8 +0x0001000000000000 .U16 +0x0001800000000000 .S16 +0x0002000000000000 +0x0002000000000000 .32 +0x0002800800000000 .64 +0x0003003800000000 .128 + +LDG, STG: cache +0x0000000000000000 CG +0x0000000000000000 CI +0x0000040000000000 CS +0x0000000000000000 CV +0x0000000000000000 WT + +LDG +0x0000008000000000 E + +LDL: cache +0x0000200000000000 CI + +LDL, STL: cache +0x0000800000000000 CG +0x0001000000000000 LU +0x0001800000000000 CV +0x0001800000000000 WT + +LDC: cache +0x0000100000000000 IL + +STG, LDS, STS, LDL, STL, LDC +0x0000200000000000 E + +LDS +0x0008000000000000 U + +RED: type +0x0000000000000000 +0x0010000000000000 .S32 +0x0020000000000000 .U64 +0x0030000000000000 .F32.FTZ.RN +0x0040000000000000 .F16x2.FTZ.RN +0x0050000000000000 .S64 + +RED: mode +0x0000000000000000 ADD +0x0080000000000000 MIN +0x0100000000000000 MAX +0x0180000000000000 INC +0x0200000000000000 DEC +0x0280000000000000 AND +0x0300000000000000 OR +0x0380000000000000 XOR + +ATOM: type +0x0000000000000000 +0x0002000000000000 .S32 +0x0004000000000000 .U64 +0x0006000000000000 .F32.FTZ.RN +0x0008000000000000 .F16x2.FTZ.RN +0x000a000000000000 .S64 +0x0002000000000000 .64 + +ATOM, RED +0x0008000000000000 E + +LD, ST +0x0080000000000000 E + +ATOM: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x03f0000000000000 CAS + +ATOMS: type +0x0000000000000000 +0x0000000010000000 .S32 +0x0000000020000000 .U64 
+0x0000000030000000 .S64 +0x0010000000000000 .64 + +ATOMS: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x0240000000000000 CAS + +BFE:REV2B +0x0000080000000000 REV2B +}; + +our %flags; +my (@ops, $flag); +foreach my $line (@flags) +{ + if ($line =~ m'^(0x[0-9a-z]+)\s*(.*)') + { + my $val = hex($1); + if ($flag) + { $flags{$_}{$flag}{$2} = $val foreach @ops; } + else + { $flags{$_}{$2} = $val foreach @ops; } + } + else + { + my ($ops, $name) = split ':\s*', $line; + @ops = split ',\s*', $ops; + $flag = $name; + } +} + +sub parseInstruct +{ + my ($inst, $grammar) = @_; + return unless $inst =~ $grammar->{rule}; + my %capData = %+; + return \%capData; +} + +my %immedOps = map { $_ => 1 } qw(i20 f20 d20); +my %immedCodes = +( + 0x5c => 0x64, + 0x5b => 0x6d, + 0x59 => 0x6b, + 0x58 => 0x68, +); +my %constCodes = +( + c20 => 0x2, + c39 => 0x1, +); +my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4); + +sub genReuseCode +{ + my $capData = shift; + my $reuse = 0; + $reuse |= $reuseCodes{$_} foreach grep $capData->{$_}, keys %reuseCodes; + return $reuse; +} + +sub genCode +{ + my ($op, $grammar, $capData, $test) = @_; + + my $flags = $flags{$op}; + my $code = $grammar->{code}; + my $reuse = 0; + + + if (exists $capData->{noPred}) + { + delete $capData->{noPred}; + push @$test, 'noPred' if $test; + } + else + { + my $p = defined($capData->{predNum}) ? 
$capData->{predNum} : 7; + push @$test, 'predNum' if $test; + if (exists $capData->{predNot}) + { + $p |= 8; + push @$test, 'predNot' if $test; + } + $code |= $p << 18; + delete @{$capData}{qw(predNum predNot)}; + + } + foreach my $rcode (qw(reuse1 reuse2 reuse3)) + { + if (delete $capData->{$rcode}) + { + $reuse |= $reuseCodes{$rcode}; + push @$test, $rcode if $test; + } + } + + foreach my $capture (keys %$capData) + { + if (exists $constCodes{$capture}) + { $code ^= $constCodes{$capture} << 62; } + + if (exists $operands{$capture}) + { + unless ($capture eq 'r20' && exists $capData->{r39s20}) + { + $code ^= $operands{$capture}->($capData->{$capture}); + push @$test, $capture if $test; + } + } + + if (exists $flags->{$capture}) + { + if (ref $flags->{$capture}) + { + $code ^= $flags->{$capture}{$capData->{$capture}}; + push @$test, "$capture:$capData->{$capture}" if $test; + } + else + { + $code ^= $flags->{$capture}; + push @$test, $capture if $test; + } + } + elsif (!exists $operands{$capture} && !$test) + { + warn "UNUSED: $op: $capture: $capData->{$capture}\n"; + warn Dumper($flags); + } + } + + return $code, $reuse; +} + + +my $CtrlRe = qr'(?[T\-]:[G\-]:[D\-]:[S\-]:[0-9]{2})'; +my $PredRe = qr'(?@!?(?P\d)\s+)'; +my $InstRe = qr"$PredRe?(?\w+)(?[^;]*;)"o; +my $CommRe = qr'(?.*)'; + +sub processAsmLine +{ + my ($line, $lineNum) = @_; + + if ($line =~ m"^$CtrlRe(?\s+)$InstRe$CommRe"o) + { + return { + lineNum => $lineNum, + pred => $+{pred}, + predReg => $+{predReg}, + space => $+{space}, + op => $+{op}, + comment => $+{comment}, + inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}), + ctrl => readCtrl($+{ctrl}, $line), + }; + } + return undef; +} + +sub processSassLine +{ + my $line = shift; + + if ($line =~ m"^\s+/\*(?[0-9a-f]+)\*/\s+$InstRe\s+/\* (?0x[0-9a-f]+)"o) + { + return { + num => hex($+{num}), + pred => $+{pred}, + op => $+{op}, + ins => normalizeSpacing($+{op} . $+{rest}), + inst => normalizeSpacing($+{pred} . $+{op} . 
# Decode one control-word line of a SASS dump.
#
# $line is expected to look like "   /* 0x<hex> ..."; anything else returns 0
# and leaves both arrays untouched.  On a match the function returns 1 and:
#   - when $ctrl is an array ref, appends seven 8-bit fields taken from bit
#     offsets 2, 10, 18, 26, 34, 42 and 50 of the word;
#   - when $ruse is an array ref, appends seven 4-bit fields taken from bit
#     offsets 17, 38, 59, 17, 38, 59, 59 (the repetition is deliberate here;
#     presumably it keeps one entry per control slot -- confirm with callers).
sub processSassCtrlLine
{
    my ($line, $ctrl, $ruse) = @_;

    return 0 unless $line =~ m'^\s+\/\* (0x[0-9a-f]+)';
    my $word = hex($1);

    # Shifting first and masking afterwards selects the same bits as the
    # equivalent "mask then shift" form.
    my @ctrlShifts = (2, 10, 18, 26, 34, 42, 50);
    my @ruseShifts = (17, 38, 59, 17, 38, 59, 59);

    push @$ctrl, map { ($word >> $_) & 0xff } @ctrlShifts if ref $ctrl;
    push @$ruse, map { ($word >> $_) & 0x0f } @ruseShifts if ref $ruse;

    return 1;
}
# Replace every tab with a space, then squeeze each run of two or more
# whitespace characters down to a single space.  Returns the new string.
sub normalizeSpacing
{
    my $text = shift;
    for ($text)
    {
        s/\t/ /g;
        s/\s{2,}/ /g;
    }
    return $text;
}


# Render an 8-bit control code as "T:G:D:S:NN".
#   bit 7 set   -> 'T'   bit 6 set -> 'G'   bit 4 set -> 'S'   (otherwise '-')
#   bit 5 CLEAR -> 'D'   (this flag is stored inverted; set prints '-')
#   bits 0-3    -> stall count, two decimal digits
sub printCtrl
{
    my $code = shift;

    my $stall = sprintf('%02d', $code & 0x0f);
    my @flag  = (
        ($code & 0x80) ? 'T' : '-',
        ($code & 0x40) ? 'G' : '-',
        ($code & 0x20) ? '-' : 'D',
        ($code & 0x10) ? 'S' : '-',
    );

    return sprintf '%s:%s:%s:%s:%02d', @flag, $stall;
}
# Parse a "T:G:D:S:NN" control string back into its 8-bit code (the inverse
# of printCtrl).  $context is accepted for interface compatibility but is
# not used by this function.
sub readCtrl
{
    my ($ctrl, $context) = @_;
    my ($tex, $global, $dual, $shared, $cycles) = split ':', $ctrl;

    my $bits = 0;
    $bits |= 0x80 if $tex    eq 'T';
    $bits |= 0x40 if $global eq 'G';
    $bits |= 0x20 if $dual   ne 'D';    # inverted: 'D' clears the bit
    $bits |= 0x10 if $shared eq 'S';

    return $bits | sprintf("%d", $cycles);
}
# Expand the destination register capture (r0) into the list of registers it
# covers.
#
#   $vectors - hash ref mapping a vector name to an array ref of register names
#   $capData - hash ref of instruction captures; reads r0, type and i31w4
#
# Returns an empty list when r0 is missing/false or is 'RZ'.  For a 64-bit
# access (type '.64' or i31w4 '0x3') returns two registers, for a 128-bit
# access (type '.128' or i31w4 '0xf') four; otherwise just the r0 name.
# A literal "R<n>" expands to consecutive numbered registers; any other name
# must be present in $vectors (with exactly four members for 128-bit) or the
# function confesses.
sub getVecRegisters
{
    my ($vectors, $capData) = @_;
    my $regName = $capData->{r0} or return;

    return if $regName eq 'RZ';

    # Number of registers touched; the 64-bit test takes priority.
    my $width = ($capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3') ? 2
              : ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') ? 4
              :                                                              0;
    return $regName unless $width;

    if (my ($first) = $regName =~ m'^R(\d+)$')
    {
        return map "R$_", ($first .. $first + $width - 1);
    }

    if ($width == 2)
    {
        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
        return @{$vectors->{$regName}}[0,1];
    }

    confess "$regName not a 128bit vector register"
        unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
    return @{$vectors->{$regName}};
}

# Expand the address register capture (r8) into the registers it covers.
#
# Returns an empty list when r8 is missing/false or is 'RZ', and the bare
# register name when the E capture is absent.  With E present the address
# occupies two registers: "R<n>" expands to R<n>, R<n+1>; any other name is
# looked up in $vectors (dumping $vectors and confessing when it is unknown).
sub getAddrVecRegisters
{
    my ($vectors, $capData) = @_;
    my $regName = $capData->{r8} or return;

    return if $regName eq 'RZ';
    return $regName unless exists $capData->{E};

    return map "R$_", ($1 .. $1+1) if $regName =~ m'^R(\d+)$';

    unless (exists $vectors->{$regName})
    {
        print Dumper($vectors);
        confess "$regName not a 64bit vector register";
    }
    return @{$vectors->{$regName}}[0,1];
}
+ +1.01 Thu Mar 26 17:09:57 2015 + - original Perl packaged version diff --git a/Assembler/MaxAs/Install.sh b/Assembler/MaxAs/Install.sh new file mode 100755 index 0000000..57c8d24 --- /dev/null +++ b/Assembler/MaxAs/Install.sh @@ -0,0 +1,3 @@ +perl Makefile.PL +make +sudo make install diff --git a/Assembler/MaxAs/LICENSE b/Assembler/MaxAs/LICENSE new file mode 100644 index 0000000..6c28fad --- /dev/null +++ b/Assembler/MaxAs/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Scott Gray + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/Assembler/MaxAs/MANIFEST b/Assembler/MaxAs/MANIFEST new file mode 100644 index 0000000..a25084c --- /dev/null +++ b/Assembler/MaxAs/MANIFEST @@ -0,0 +1,38 @@ +bin/maxas.pl +Changes +lib/MaxAs/Cubin.pm +lib/MaxAs/MaxAs.pm +lib/MaxAs/MaxAsGrammar.pm +LICENSE +Makefile.PL +MANIFEST +microbench/microbench.cpp +microbench/microbench.cu +microbench/microbench.sass +microbench/shared.pl +microbench/shared_lds.sass +microbench/shared_sts16.sass +microbench/throughput.pl +microbench/throughput.sass +microbench/throughput2.pl +microbench/throughput2.sass +microbench/throughput3.pl +microbench/throughput4.pl +microbench/throughput5.pl +microbench/xmad.pl +microbench/xmad2.sass +README.md +sgemm/batched_gemm.xlsx +sgemm/cublas_sgemm.ptx +sgemm/sgemm.cpp +sgemm/sgemm.cu +sgemm/sgemm.pl +sgemm/sgemm.sln +sgemm/sgemm.vcxproj +sgemm/sgemm128.sass +sgemm/sgemm64.sass +sgemm/sgemm_final_128.sass +sgemm/sgemm_final_64.sass +sgemm/sgemm_pre_128.sass +sgemm/sgemm_pre_64.sass +t/MaxAs-MaxAs.t diff --git a/Assembler/MaxAs/Makefile b/Assembler/MaxAs/Makefile new file mode 100644 index 0000000..79e0de9 --- /dev/null +++ b/Assembler/MaxAs/Makefile @@ -0,0 +1,840 @@ +# This Makefile is for the MaxAs::MaxAs extension to perl. +# +# It was generated automatically by MakeMaker version +# 6.55_02 (Revision: 65502) from the contents of +# Makefile.PL. Don't edit this file, edit Makefile.PL instead. +# +# ANY CHANGES MADE HERE WILL BE LOST! +# +# MakeMaker ARGV: () +# + +# MakeMaker Parameters: + +# ABSTRACT_FROM => q[lib/MaxAs/MaxAs.pm] +# AUTHOR => q[Scott Gray ] +# BUILD_REQUIRES => { } +# EXE_FILES => [q[bin/maxas.pl]] +# LICENSE => q[MIT] +# NAME => q[MaxAs::MaxAs] +# PREREQ_PM => { Data::Dumper=>q[2.145], Carp=>q[1.29] } +# VERSION_FROM => q[lib/MaxAs/MaxAs.pm] + +# --- MakeMaker post_initialize section: + + +# --- MakeMaker const_config section: + +# These definitions are from config.sh (via /usr/lib64/perl5/Config.pm). 
+# They may have been overridden via Makefile.PL or on the command line. +AR = ar +CC = gcc +CCCDLFLAGS = -fPIC +CCDLFLAGS = -Wl,-E -Wl,-rpath,/usr/lib64/perl5/CORE +DLEXT = so +DLSRC = dl_dlopen.xs +EXE_EXT = +FULL_AR = /usr/bin/ar +LD = gcc +LDDLFLAGS = -shared -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic +LDFLAGS = -fstack-protector +LIBC = +LIB_EXT = .a +OBJ_EXT = .o +OSNAME = linux +OSVERS = 2.6.18-308.1.1.el5 +RANLIB = : +SITELIBEXP = /usr/local/share/perl5 +SITEARCHEXP = /usr/local/lib64/perl5 +SO = so +VENDORARCHEXP = /usr/lib64/perl5/vendor_perl +VENDORLIBEXP = /usr/share/perl5/vendor_perl + + +# --- MakeMaker constants section: +AR_STATIC_ARGS = cr +DIRFILESEP = / +DFSEP = $(DIRFILESEP) +NAME = MaxAs::MaxAs +NAME_SYM = MaxAs_MaxAs +VERSION = 1.06 +VERSION_MACRO = VERSION +VERSION_SYM = 1_06 +DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\" +XS_VERSION = 1.06 +XS_VERSION_MACRO = XS_VERSION +XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\" +INST_ARCHLIB = blib/arch +INST_SCRIPT = blib/script +INST_BIN = blib/bin +INST_LIB = blib/lib +INST_MAN1DIR = blib/man1 +INST_MAN3DIR = blib/man3 +MAN1EXT = 1 +MAN3EXT = 3pm +INSTALLDIRS = site +DESTDIR = +PREFIX = $(SITEPREFIX) +PERLPREFIX = /usr +SITEPREFIX = /usr/local +VENDORPREFIX = /usr +INSTALLPRIVLIB = /usr/share/perl5 +DESTINSTALLPRIVLIB = $(DESTDIR)$(INSTALLPRIVLIB) +INSTALLSITELIB = /usr/local/share/perl5 +DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB) +INSTALLVENDORLIB = /usr/share/perl5/vendor_perl +DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB) +INSTALLARCHLIB = /usr/lib64/perl5 +DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB) +INSTALLSITEARCH = /usr/local/lib64/perl5 +DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH) +INSTALLVENDORARCH = /usr/lib64/perl5/vendor_perl +DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH) +INSTALLBIN = /usr/bin +DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN) +INSTALLSITEBIN = 
/usr/local/bin +DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN) +INSTALLVENDORBIN = /usr/bin +DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN) +INSTALLSCRIPT = /usr/bin +DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT) +INSTALLSITESCRIPT = /usr/local/bin +DESTINSTALLSITESCRIPT = $(DESTDIR)$(INSTALLSITESCRIPT) +INSTALLVENDORSCRIPT = /usr/bin +DESTINSTALLVENDORSCRIPT = $(DESTDIR)$(INSTALLVENDORSCRIPT) +INSTALLMAN1DIR = /usr/share/man/man1 +DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR) +INSTALLSITEMAN1DIR = /usr/local/share/man/man1 +DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR) +INSTALLVENDORMAN1DIR = /usr/share/man/man1 +DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR) +INSTALLMAN3DIR = /usr/share/man/man3 +DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR) +INSTALLSITEMAN3DIR = /usr/local/share/man/man3 +DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR) +INSTALLVENDORMAN3DIR = /usr/share/man/man3 +DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR) +PERL_LIB = /usr/share/perl5 +PERL_ARCHLIB = /usr/lib64/perl5 +LIBPERL_A = libperl.a +FIRST_MAKEFILE = Makefile +MAKEFILE_OLD = Makefile.old +MAKE_APERL_FILE = Makefile.aperl +PERLMAINCC = $(CC) +PERL_INC = /usr/lib64/perl5/CORE +PERL = /usr/bin/perl +FULLPERL = /usr/bin/perl +ABSPERL = $(PERL) +PERLRUN = $(PERL) +FULLPERLRUN = $(FULLPERL) +ABSPERLRUN = $(ABSPERL) +PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +PERL_CORE = 0 +PERM_DIR = 755 +PERM_RW = 644 +PERM_RWX = 755 + +MAKEMAKER = /usr/share/perl5/ExtUtils/MakeMaker.pm +MM_VERSION = 6.55_02 +MM_REVISION = 65502 + +# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle). +# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle) +# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar) +# DLBASE = Basename part of dynamic library. 
May be just equal BASEEXT. +MAKE = make +FULLEXT = MaxAs/MaxAs +BASEEXT = MaxAs +PARENT_NAME = MaxAs +DLBASE = $(BASEEXT) +VERSION_FROM = lib/MaxAs/MaxAs.pm +OBJECT = +LDFROM = $(OBJECT) +LINKTYPE = dynamic +BOOTDEP = + +# Handy lists of source code files: +XS_FILES = +C_FILES = +O_FILES = +H_FILES = +MAN1PODS = +MAN3PODS = lib/MaxAs/MaxAs.pm + +# Where is the Config information that we are using/depend on +CONFIGDEP = $(PERL_ARCHLIB)$(DFSEP)Config.pm $(PERL_INC)$(DFSEP)config.h + +# Where to build things +INST_LIBDIR = $(INST_LIB)/MaxAs +INST_ARCHLIBDIR = $(INST_ARCHLIB)/MaxAs + +INST_AUTODIR = $(INST_LIB)/auto/$(FULLEXT) +INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT) + +INST_STATIC = +INST_DYNAMIC = +INST_BOOT = + +# Extra linker info +EXPORT_LIST = +PERL_ARCHIVE = +PERL_ARCHIVE_AFTER = + + +TO_INST_PM = lib/MaxAs/Cubin.pm \ + lib/MaxAs/MaxAs.pm \ + lib/MaxAs/MaxAsGrammar.pm + +PM_TO_BLIB = lib/MaxAs/MaxAs.pm \ + blib/lib/MaxAs/MaxAs.pm \ + lib/MaxAs/Cubin.pm \ + blib/lib/MaxAs/Cubin.pm \ + lib/MaxAs/MaxAsGrammar.pm \ + blib/lib/MaxAs/MaxAsGrammar.pm + + +# --- MakeMaker platform_constants section: +MM_Unix_VERSION = 6.55_02 +PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc + + +# --- MakeMaker tool_autosplit section: +# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto +AUTOSPLITFILE = $(ABSPERLRUN) -e 'use AutoSplit; autosplit($$ARGV[0], $$ARGV[1], 0, 1, 1)' -- + + + +# --- MakeMaker tool_xsubpp section: + + +# --- MakeMaker tools_other section: +SHELL = /bin/sh +CHMOD = chmod +CP = cp +MV = mv +NOOP = $(TRUE) +NOECHO = @ +RM_F = rm -f +RM_RF = rm -rf +TEST_F = test -f +TOUCH = touch +UMASK_NULL = umask 0 +DEV_NULL = > /dev/null 2>&1 +MKPATH = $(ABSPERLRUN) -MExtUtils::Command -e 'mkpath' -- +EQUALIZE_TIMESTAMP = $(ABSPERLRUN) -MExtUtils::Command -e 'eqtime' -- +FALSE = false +TRUE = true +ECHO = echo +ECHO_N = echo -n +UNINST = 0 +VERBINST = 0 +MOD_INSTALL = $(ABSPERLRUN) 
-MExtUtils::Install -e 'install([ from_to => {@ARGV}, verbose => '\''$(VERBINST)'\'', uninstall_shadows => '\''$(UNINST)'\'', dir_mode => '\''$(PERM_DIR)'\'' ]);' -- +DOC_INSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'perllocal_install' -- +UNINSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'uninstall' -- +WARN_IF_OLD_PACKLIST = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'warn_if_old_packlist' -- +MACROSTART = +MACROEND = +USEMAKEFILE = -f +FIXIN = $(ABSPERLRUN) -MExtUtils::MY -e 'MY->fixin(shift)' -- + + +# --- MakeMaker makemakerdflt section: +makemakerdflt : all + $(NOECHO) $(NOOP) + + +# --- MakeMaker dist section: +TAR = tar +TARFLAGS = cvf +ZIP = zip +ZIPFLAGS = -r +COMPRESS = gzip --best +SUFFIX = .gz +SHAR = shar +PREOP = $(NOECHO) $(NOOP) +POSTOP = $(NOECHO) $(NOOP) +TO_UNIX = $(NOECHO) $(NOOP) +CI = ci -u +RCS_LABEL = rcs -Nv$(VERSION_SYM): -q +DIST_CP = best +DIST_DEFAULT = tardist +DISTNAME = MaxAs-MaxAs +DISTVNAME = MaxAs-MaxAs-1.06 + + +# --- MakeMaker macro section: + + +# --- MakeMaker depend section: + + +# --- MakeMaker cflags section: + + +# --- MakeMaker const_loadlibs section: + + +# --- MakeMaker const_cccmd section: + + +# --- MakeMaker post_constants section: + + +# --- MakeMaker pasthru section: + +PASTHRU = LIBPERL_A="$(LIBPERL_A)"\ + LINKTYPE="$(LINKTYPE)"\ + PREFIX="$(PREFIX)" + + +# --- MakeMaker special_targets section: +.SUFFIXES : .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT) + +.PHONY: all config static dynamic test linkext manifest blibdirs clean realclean disttest distdir + + + +# --- MakeMaker c_o section: + + +# --- MakeMaker xs_c section: + + +# --- MakeMaker xs_o section: + + +# --- MakeMaker top_targets section: +all :: pure_all manifypods + $(NOECHO) $(NOOP) + + +pure_all :: config pm_to_blib subdirs linkext + $(NOECHO) $(NOOP) + +subdirs :: $(MYEXTLIB) + $(NOECHO) $(NOOP) + +config :: $(FIRST_MAKEFILE) blibdirs + $(NOECHO) $(NOOP) + +help : + perldoc ExtUtils::MakeMaker + + +# --- MakeMaker blibdirs section: +blibdirs : 
$(INST_LIBDIR)$(DFSEP).exists $(INST_ARCHLIB)$(DFSEP).exists $(INST_AUTODIR)$(DFSEP).exists $(INST_ARCHAUTODIR)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists $(INST_SCRIPT)$(DFSEP).exists $(INST_MAN1DIR)$(DFSEP).exists $(INST_MAN3DIR)$(DFSEP).exists + $(NOECHO) $(NOOP) + +# Backwards compat with 6.18 through 6.25 +blibdirs.ts : blibdirs + $(NOECHO) $(NOOP) + +$(INST_LIBDIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_LIBDIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_LIBDIR) + $(NOECHO) $(TOUCH) $(INST_LIBDIR)$(DFSEP).exists + +$(INST_ARCHLIB)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_ARCHLIB) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHLIB) + $(NOECHO) $(TOUCH) $(INST_ARCHLIB)$(DFSEP).exists + +$(INST_AUTODIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_AUTODIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_AUTODIR) + $(NOECHO) $(TOUCH) $(INST_AUTODIR)$(DFSEP).exists + +$(INST_ARCHAUTODIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_ARCHAUTODIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHAUTODIR) + $(NOECHO) $(TOUCH) $(INST_ARCHAUTODIR)$(DFSEP).exists + +$(INST_BIN)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_BIN) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_BIN) + $(NOECHO) $(TOUCH) $(INST_BIN)$(DFSEP).exists + +$(INST_SCRIPT)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_SCRIPT) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_SCRIPT) + $(NOECHO) $(TOUCH) $(INST_SCRIPT)$(DFSEP).exists + +$(INST_MAN1DIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_MAN1DIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN1DIR) + $(NOECHO) $(TOUCH) $(INST_MAN1DIR)$(DFSEP).exists + +$(INST_MAN3DIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_MAN3DIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN3DIR) + $(NOECHO) $(TOUCH) $(INST_MAN3DIR)$(DFSEP).exists + + + +# --- MakeMaker linkext section: + +linkext :: $(LINKTYPE) + $(NOECHO) $(NOOP) + + +# --- MakeMaker dlsyms section: + + +# --- MakeMaker 
dynamic section: + +dynamic :: $(FIRST_MAKEFILE) $(INST_DYNAMIC) $(INST_BOOT) + $(NOECHO) $(NOOP) + + +# --- MakeMaker dynamic_bs section: + +BOOTSTRAP = + + +# --- MakeMaker dynamic_lib section: + + +# --- MakeMaker static section: + +## $(INST_PM) has been moved to the all: target. +## It remains here for awhile to allow for old usage: "make static" +static :: $(FIRST_MAKEFILE) $(INST_STATIC) + $(NOECHO) $(NOOP) + + +# --- MakeMaker static_lib section: + + +# --- MakeMaker manifypods section: + +POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--" +POD2MAN = $(POD2MAN_EXE) + + +manifypods : pure_all \ + lib/MaxAs/MaxAs.pm + $(NOECHO) $(POD2MAN) --section=3 --perm_rw=$(PERM_RW) \ + lib/MaxAs/MaxAs.pm $(INST_MAN3DIR)/MaxAs::MaxAs.$(MAN3EXT) + + + + +# --- MakeMaker processPL section: + + +# --- MakeMaker installbin section: + +EXE_FILES = bin/maxas.pl + +pure_all :: $(INST_SCRIPT)/maxas.pl + $(NOECHO) $(NOOP) + +realclean :: + $(RM_F) \ + $(INST_SCRIPT)/maxas.pl + +$(INST_SCRIPT)/maxas.pl : bin/maxas.pl $(FIRST_MAKEFILE) $(INST_SCRIPT)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists + $(NOECHO) $(RM_F) $(INST_SCRIPT)/maxas.pl + $(CP) bin/maxas.pl $(INST_SCRIPT)/maxas.pl + $(FIXIN) $(INST_SCRIPT)/maxas.pl + -$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_SCRIPT)/maxas.pl + + + +# --- MakeMaker subdirs section: + +# none + +# --- MakeMaker clean_subdirs section: +clean_subdirs : + $(NOECHO) $(NOOP) + + +# --- MakeMaker clean section: + +# Delete temporary files but do not touch installed files. We don't delete +# the Makefile here so a later make realclean still has a makefile to use. + +clean :: clean_subdirs + - $(RM_F) \ + *$(LIB_EXT) core \ + core.[0-9] $(INST_ARCHAUTODIR)/extralibs.all \ + core.[0-9][0-9] $(BASEEXT).bso \ + pm_to_blib.ts core.[0-9][0-9][0-9][0-9] \ + $(BASEEXT).x $(BOOTSTRAP) \ + perl$(EXE_EXT) tmon.out \ + *$(OBJ_EXT) pm_to_blib \ + $(INST_ARCHAUTODIR)/extralibs.ld blibdirs.ts \ + core.[0-9][0-9][0-9][0-9][0-9] *perl.core \ + core.*perl.*.? 
$(MAKE_APERL_FILE) \ + perl $(BASEEXT).def \ + core.[0-9][0-9][0-9] mon.out \ + lib$(BASEEXT).def perlmain.c \ + perl.exe so_locations \ + $(BASEEXT).exp + - $(RM_RF) \ + blib + - $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL) + + +# --- MakeMaker realclean_subdirs section: +realclean_subdirs : + $(NOECHO) $(NOOP) + + +# --- MakeMaker realclean section: +# Delete temporary files (via clean) and also delete dist files +realclean purge :: clean realclean_subdirs + - $(RM_F) \ + $(MAKEFILE_OLD) $(FIRST_MAKEFILE) + - $(RM_RF) \ + $(DISTVNAME) + + +# --- MakeMaker metafile section: +metafile : create_distdir + $(NOECHO) $(ECHO) Generating META.yml + $(NOECHO) $(ECHO) '--- #YAML:1.0' > META_new.yml + $(NOECHO) $(ECHO) 'name: MaxAs-MaxAs' >> META_new.yml + $(NOECHO) $(ECHO) 'version: 1.06' >> META_new.yml + $(NOECHO) $(ECHO) 'abstract: Assembler for NVIDIA Maxwell architecture' >> META_new.yml + $(NOECHO) $(ECHO) 'author:' >> META_new.yml + $(NOECHO) $(ECHO) ' - Scott Gray ' >> META_new.yml + $(NOECHO) $(ECHO) 'license: MIT' >> META_new.yml + $(NOECHO) $(ECHO) 'distribution_type: module' >> META_new.yml + $(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: 0' >> META_new.yml + $(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: 0' >> META_new.yml + $(NOECHO) $(ECHO) 'requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' Carp: 1.29' >> META_new.yml + $(NOECHO) $(ECHO) ' Data::Dumper: 2.145' >> META_new.yml + $(NOECHO) $(ECHO) 'no_index:' >> META_new.yml + $(NOECHO) $(ECHO) ' directory:' >> META_new.yml + $(NOECHO) $(ECHO) ' - t' >> META_new.yml + $(NOECHO) $(ECHO) ' - inc' >> META_new.yml + $(NOECHO) $(ECHO) 'generated_by: ExtUtils::MakeMaker version 6.55_02' >> META_new.yml + $(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml + $(NOECHO) $(ECHO) ' url: http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml + $(NOECHO) $(ECHO) ' version: 1.4' >> META_new.yml + -$(NOECHO) 
$(MV) META_new.yml $(DISTVNAME)/META.yml + + +# --- MakeMaker signature section: +signature : + cpansign -s + + +# --- MakeMaker dist_basics section: +distclean :: realclean distcheck + $(NOECHO) $(NOOP) + +distcheck : + $(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck + +skipcheck : + $(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck + +manifest : + $(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest + +veryclean : realclean + $(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old + + + +# --- MakeMaker dist_core section: + +dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE) + $(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \ + -e ' if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' -- + +tardist : $(DISTVNAME).tar$(SUFFIX) + $(NOECHO) $(NOOP) + +uutardist : $(DISTVNAME).tar$(SUFFIX) + uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu + +$(DISTVNAME).tar$(SUFFIX) : distdir + $(PREOP) + $(TO_UNIX) + $(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME) + $(RM_RF) $(DISTVNAME) + $(COMPRESS) $(DISTVNAME).tar + $(POSTOP) + +zipdist : $(DISTVNAME).zip + $(NOECHO) $(NOOP) + +$(DISTVNAME).zip : distdir + $(PREOP) + $(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME) + $(RM_RF) $(DISTVNAME) + $(POSTOP) + +shdist : distdir + $(PREOP) + $(SHAR) $(DISTVNAME) > $(DISTVNAME).shar + $(RM_RF) $(DISTVNAME) + $(POSTOP) + + +# --- MakeMaker distdir section: +create_distdir : + $(RM_RF) $(DISTVNAME) + $(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \ + -e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');" + +distdir : create_distdir distmeta + $(NOECHO) $(NOOP) + + + +# --- MakeMaker dist_test section: +disttest : distdir + cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL + cd $(DISTVNAME) && $(MAKE) $(PASTHRU) + cd $(DISTVNAME) && $(MAKE) test $(PASTHRU) + + + +# --- MakeMaker dist_ci section: + +ci : + $(PERLRUN) 
"-MExtUtils::Manifest=maniread" \ + -e "@all = keys %{ maniread() };" \ + -e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \ + -e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});" + + +# --- MakeMaker distmeta section: +distmeta : create_distdir metafile + $(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{META.yml} => q{Module meta-data (added by MakeMaker)}}) } ' \ + -e ' or print "Could not add META.yml to MANIFEST: $${'\''@'\''}\n"' -- + + + +# --- MakeMaker distsignature section: +distsignature : create_distdir + $(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) } ' \ + -e ' or print "Could not add SIGNATURE to MANIFEST: $${'\''@'\''}\n"' -- + $(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE + cd $(DISTVNAME) && cpansign -s + + + +# --- MakeMaker install section: + +install :: pure_install doc_install + $(NOECHO) $(NOOP) + +install_perl :: pure_perl_install doc_perl_install + $(NOECHO) $(NOOP) + +install_site :: pure_site_install doc_site_install + $(NOECHO) $(NOOP) + +install_vendor :: pure_vendor_install doc_vendor_install + $(NOECHO) $(NOOP) + +pure_install :: pure_$(INSTALLDIRS)_install + $(NOECHO) $(NOOP) + +doc_install :: doc_$(INSTALLDIRS)_install + $(NOECHO) $(NOOP) + +pure__install : pure_site_install + $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site + +doc__install : doc_site_install + $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site + +pure_perl_install :: all + $(NOECHO) $(MOD_INSTALL) \ + read $(PERL_ARCHLIB)/auto/$(FULLEXT)/.packlist \ + write $(DESTINSTALLARCHLIB)/auto/$(FULLEXT)/.packlist \ + $(INST_LIB) $(DESTINSTALLPRIVLIB) \ + $(INST_ARCHLIB) $(DESTINSTALLARCHLIB) \ + $(INST_BIN) $(DESTINSTALLBIN) \ + $(INST_SCRIPT) $(DESTINSTALLSCRIPT) \ + $(INST_MAN1DIR) $(DESTINSTALLMAN1DIR) \ + $(INST_MAN3DIR) 
$(DESTINSTALLMAN3DIR) + $(NOECHO) $(WARN_IF_OLD_PACKLIST) \ + $(SITEARCHEXP)/auto/$(FULLEXT) + + +pure_site_install :: all + $(NOECHO) $(MOD_INSTALL) \ + read $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist \ + write $(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist \ + $(INST_LIB) $(DESTINSTALLSITELIB) \ + $(INST_ARCHLIB) $(DESTINSTALLSITEARCH) \ + $(INST_BIN) $(DESTINSTALLSITEBIN) \ + $(INST_SCRIPT) $(DESTINSTALLSITESCRIPT) \ + $(INST_MAN1DIR) $(DESTINSTALLSITEMAN1DIR) \ + $(INST_MAN3DIR) $(DESTINSTALLSITEMAN3DIR) + $(NOECHO) $(WARN_IF_OLD_PACKLIST) \ + $(PERL_ARCHLIB)/auto/$(FULLEXT) + +pure_vendor_install :: all + $(NOECHO) $(MOD_INSTALL) \ + read $(VENDORARCHEXP)/auto/$(FULLEXT)/.packlist \ + write $(DESTINSTALLVENDORARCH)/auto/$(FULLEXT)/.packlist \ + $(INST_LIB) $(DESTINSTALLVENDORLIB) \ + $(INST_ARCHLIB) $(DESTINSTALLVENDORARCH) \ + $(INST_BIN) $(DESTINSTALLVENDORBIN) \ + $(INST_SCRIPT) $(DESTINSTALLVENDORSCRIPT) \ + $(INST_MAN1DIR) $(DESTINSTALLVENDORMAN1DIR) \ + $(INST_MAN3DIR) $(DESTINSTALLVENDORMAN3DIR) + +doc_perl_install :: all + $(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod + -$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB) + -$(NOECHO) $(DOC_INSTALL) \ + "Module" "$(NAME)" \ + "installed into" "$(INSTALLPRIVLIB)" \ + LINKTYPE "$(LINKTYPE)" \ + VERSION "$(VERSION)" \ + EXE_FILES "$(EXE_FILES)" \ + >> $(DESTINSTALLARCHLIB)/perllocal.pod + +doc_site_install :: all + $(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod + -$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB) + -$(NOECHO) $(DOC_INSTALL) \ + "Module" "$(NAME)" \ + "installed into" "$(INSTALLSITELIB)" \ + LINKTYPE "$(LINKTYPE)" \ + VERSION "$(VERSION)" \ + EXE_FILES "$(EXE_FILES)" \ + >> $(DESTINSTALLARCHLIB)/perllocal.pod + +doc_vendor_install :: all + $(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod + -$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB) + -$(NOECHO) $(DOC_INSTALL) \ + "Module" "$(NAME)" \ + "installed 
into" "$(INSTALLVENDORLIB)" \ + LINKTYPE "$(LINKTYPE)" \ + VERSION "$(VERSION)" \ + EXE_FILES "$(EXE_FILES)" \ + >> $(DESTINSTALLARCHLIB)/perllocal.pod + + +uninstall :: uninstall_from_$(INSTALLDIRS)dirs + $(NOECHO) $(NOOP) + +uninstall_from_perldirs :: + $(NOECHO) $(UNINSTALL) $(PERL_ARCHLIB)/auto/$(FULLEXT)/.packlist + +uninstall_from_sitedirs :: + $(NOECHO) $(UNINSTALL) $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist + +uninstall_from_vendordirs :: + $(NOECHO) $(UNINSTALL) $(VENDORARCHEXP)/auto/$(FULLEXT)/.packlist + + +# --- MakeMaker force section: +# Phony target to force checking subdirectories. +FORCE : + $(NOECHO) $(NOOP) + + +# --- MakeMaker perldepend section: + + +# --- MakeMaker makefile section: +# We take a very conservative approach here, but it's worth it. +# We move Makefile to Makefile.old here to avoid gnu make looping. +$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP) + $(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?" + $(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..." + -$(NOECHO) $(RM_F) $(MAKEFILE_OLD) + -$(NOECHO) $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) + - $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL) + $(PERLRUN) Makefile.PL + $(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <==" + $(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command. 
<==" + $(FALSE) + + + +# --- MakeMaker staticmake section: + +# --- MakeMaker makeaperl section --- +MAP_TARGET = perl +FULLPERL = /usr/bin/perl + +$(MAP_TARGET) :: static $(MAKE_APERL_FILE) + $(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@ + +$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib + $(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET) + $(NOECHO) $(PERLRUNINST) \ + Makefile.PL DIR= \ + MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \ + MAKEAPERL=1 NORECURS=1 CCCDLFLAGS= + + +# --- MakeMaker test section: + +TEST_VERBOSE=0 +TEST_TYPE=test_$(LINKTYPE) +TEST_FILE = test.pl +TEST_FILES = t/*.t +TESTDB_SW = -d + +testdb :: testdb_$(LINKTYPE) + +test :: $(TEST_TYPE) subdirs-test + +subdirs-test :: + $(NOECHO) $(NOOP) + + +test_dynamic :: pure_all + PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-MExtUtils::Command::MM" "-e" "test_harness($(TEST_VERBOSE), '$(INST_LIB)', '$(INST_ARCHLIB)')" $(TEST_FILES) + +testdb_dynamic :: pure_all + PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE) + +test_ : test_dynamic + +test_static :: test_dynamic +testdb_static :: testdb_dynamic + + +# --- MakeMaker ppd section: +# Creates a PPD (Perl Package Description) for a binary distribution. 
+ppd : + $(NOECHO) $(ECHO) '' > $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' Assembler for NVIDIA Maxwell architecture' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' Scott Gray <sgray@nervanasys.com>' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) '' >> $(DISTNAME).ppd + + +# --- MakeMaker pm_to_blib section: + +pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM) + $(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \ + lib/MaxAs/MaxAs.pm blib/lib/MaxAs/MaxAs.pm \ + lib/MaxAs/Cubin.pm blib/lib/MaxAs/Cubin.pm \ + lib/MaxAs/MaxAsGrammar.pm blib/lib/MaxAs/MaxAsGrammar.pm + $(NOECHO) $(TOUCH) pm_to_blib + + +# --- MakeMaker selfdocument section: + + +# --- MakeMaker postamble section: + + +# End. diff --git a/Assembler/MaxAs/Makefile.PL b/Assembler/MaxAs/Makefile.PL new file mode 100644 index 0000000..4be8ccf --- /dev/null +++ b/Assembler/MaxAs/Makefile.PL @@ -0,0 +1,14 @@ +require 5.10.0; +use ExtUtils::MakeMaker; +# See lib/ExtUtils/MakeMaker.pm for details of how to influence +# the contents of the Makefile that is written. +WriteMakefile( + NAME => 'MaxAs::MaxAs', + VERSION_FROM => 'lib/MaxAs/MaxAs.pm', # finds $VERSION + EXE_FILES => ['bin/maxas.pl'], + PREREQ_PM => {Carp => 1.29, Data::Dumper => 2.145}, + LICENSE => 'MIT', + ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 + (ABSTRACT_FROM => 'lib/MaxAs/MaxAs.pm', # retrieve abstract from module + AUTHOR => 'Scott Gray ') : ()), +); diff --git a/Assembler/MaxAs/README.md b/Assembler/MaxAs/README.md new file mode 100644 index 0000000..318aba8 --- /dev/null +++ b/Assembler/MaxAs/README.md @@ -0,0 +1,28 @@ +# MaxAs +Assembler for NVIDIA Maxwell architecture + +To install (system-wide): + + sudo cpanm git://github.com/NervanaSystems/maxas.git + +or + + perl Makefile.PL + make + sudo make install + + +See wiki pages for more information: + +- [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction) +- [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started) +- [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes) +- [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM) + +Related work with lots of additional shader assembly (sass) examples: + +- [NervanaGPU](https://github.com/NervanaSystems/nervanagpu) + +This project is released under the [MIT License](http://opensource.org/licenses/MIT). 
+ +-- Scott Gray diff --git a/Assembler/MaxAs/bin/maxas.pl b/Assembler/MaxAs/bin/maxas.pl new file mode 100755 index 0000000..55e4241 --- /dev/null +++ b/Assembler/MaxAs/bin/maxas.pl @@ -0,0 +1,314 @@ +#!/usr/bin/perl +use strict; +use MaxAs::Cubin; +use MaxAs::MaxAs; +use Data::Dumper; +use File::Spec; + +require 5.10.0; + +$Data::Dumper::Sortkeys = 1; + +my $mode = shift; + +# List cubin contents +if ($mode =~ /^\-?\-l/i) +{ + my $cubinFile = shift or usage(); + + my $cubin = MaxAs::Cubin->new($cubinFile); + + my $arch = $cubin->arch; + my $class = $cubin->class; + my $asize = $cubin->address_size; + my $kernels = $cubin->listKernels; + my $symbols = $cubin->listSymbols; + + printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize; + + foreach my $ker (sort keys %$kernels) + { + printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)}; + } + foreach my $sym (sort keys %$symbols) + { + printf "Symbol: %s\n", $sym; + } +} +# Test that the assembler can reproduce the op codes this cubin or sass contains +elsif ($mode =~ /^\-?\-t/i) +{ + my $reg = shift if $ARGV[0] =~ /^\-?\-r/i; + my $all = shift if $ARGV[0] =~ /^\-?\-a/i; + my $file = shift or usage(); + my $fh; + # sass file + if (-T $file) + { + open $fh, $file or die "$file: $!"; + } + # cubin file + else + { + my $cubin = MaxAs::Cubin->new($file); + my $arch = $cubin->arch; + + open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!"; + my $first = <$fh>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + } + exit(MaxAs::MaxAs::Test($fh, $reg, $all) ? 
1 : 0); +} +# Extract an asm file containing the desired kernel +elsif ($mode =~ /^\-?\-e/i) +{ + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $cubinFile = shift or usage(); + my $asmFile = shift; + my $cubin = MaxAs::Cubin->new($cubinFile); + my $arch = $cubin->arch; + my $kernels = $cubin->listKernels; + + #default the kernel name if not specified. + $kernelName ||= (sort keys %$kernels)[0]; + + my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName"; + + open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_50 -sass -fun $kernelName $cubinFile: $!"; + my $first = <$in>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n"; + + print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt)); + + print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n"; + + print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params}; + + print $out "#\n# Instructions:\n\n"; + + MaxAs::MaxAs::Extract($in, $out, $kernel->{Params}); + + close $out if $asmFile; + close $in; +} +# Extract a kernel from a sass dump +elsif ($mode =~ /^\-?\-s/i) +{ + my $sassFile = shift or usage(); + my $asmFile = shift; + + open my $in, $sassFile or die "$sassFile: $!"; + + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + MaxAs::MaxAs::Extract($in, $out, []); + + close $out if $asmFile; + close $in; +} +# Insert the kernel asm back into the cubin: +elsif ($mode =~ /^\-?\-i/i) +{ + my $nowarn; + if ($ARGV[0] =~ /^\-?\-w/i) + { + $nowarn = shift; + } + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $noReuse = shift if $ARGV[0] 
=~ /^\-?\-n/i; + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';" + } + + my $asmFile = shift or usage(); + my $cubinFile = shift or usage(); + my $newCubin = shift || $cubinFile; + + my $file; + if (open my $fh, $asmFile) + { + local $/; + $file = <$fh>; + close $fh; + } + else { die "$asmFile: $!" } + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + # extract the kernel name from the file + ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName; + die "asm file missing kernel name or is badly formatted" unless $kernelName; + + my $kernel = MaxAs::MaxAs::Assemble($file, $include, !$noReuse, $nowarn); + + my $cubin = MaxAs::Cubin->new($cubinFile); + $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName"; + + $cubin->modifyKernel(%$kernel); + + $cubin->write($newCubin); + + printf "Kernel: $kernelName, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f% (%d/%d)\n", + @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)}; + +} +# Preprocessing: +elsif ($mode =~ /^\-?\-p/i) +{ + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';"; + } + my $debug = shift if $ARGV[0] =~ /^\-?\-d/i; + my $asmFile = shift or usage(); + my $asmFile2 = shift; + + die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2; + + open my $fh, $asmFile or die "$asmFile: $!"; + local $/; + my $file = <$fh>; + close $fh; + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + if ($asmFile2) + { + open $fh, ">$asmFile2" or die "$asmFile2: $!"; + } + else + { + $fh = \*STDOUT; + } + print $fh MaxAs::MaxAs::Preprocess($file, $include, $debug); + close $fh; +} +#Analyzing +elsif ($mode =~ /^\-?\-a/i) +{ + while ($ARGV[0] 
=~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';"; + } + my $analyze = shift if $ARGV[0] =~ /^\-?\-analyze/i; + my $config = shift or usage(); + my $asmFile = shift or usage(); + + open my $fh, $asmFile or die "$asmFile: $!"; + local $/; + my $file = <$fh>; + close $fh; + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + MaxAs::MaxAs::Occupancy($config); + MaxAs::MaxAs::Analyze($file, $include); +} +# get version information +elsif ($mode =~ /^\-?\-v/i) +{ + print "$MaxAs::MaxAs::VERSION\n"; +} +else +{ + print "$mode\n"; + usage(); +} + +exit(0); + +sub usage +{ + print < + + Test a cubin or sass file to to see if the assembler can reproduce all of the contained opcodes. + Also useful for extending the missing grammar rules. Defaults to only showing failures without --all. + With the --reg flag it will show register bank conflicts not hidden by reuse flags. + + maxas.pl --test|-t [--reg|-r] [--all|-a] + + Extract a single kernel into an asm file from a cubin. + Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin. + + maxas.pl --extract|-e [--kernel|-k kernel_name] [asm_file] + + Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes. + Include the debug flag to print out detailed scheduler info. + + maxas.pl --pre|-p [--debug|-d] [new_asm_file] + + Insert the kernel asm back into the cubin. Overwrite existing or create new cubin. + Optionally you can skip register reuse flag auto insertion. This allows you to observe + performance without any reuse or you can use it to set the flags manually in your sass. + + maxas.pl --insert|-i [--noreuse|-n] [new_cubin_file] + + Analyze each blocks in the assembly codes. Specify each instruction's efficiency, predict a block's + running cycles, and point out codes bottlenecks. 
+ + maxas.pl --analyze|-a [result_file] + + Display version information and exit: + + maxas.pl --version|-v + +EOF + exit(1); +} + +__END__ diff --git a/Assembler/MaxAs/blib/arch/.exists b/Assembler/MaxAs/blib/arch/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/blib/arch/auto/MaxAs/MaxAs/.exists b/Assembler/MaxAs/blib/arch/auto/MaxAs/MaxAs/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/blib/bin/.exists b/Assembler/MaxAs/blib/bin/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/blib/lib/MaxAs/.exists b/Assembler/MaxAs/blib/lib/MaxAs/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/blib/lib/MaxAs/Cubin.pm b/Assembler/MaxAs/blib/lib/MaxAs/Cubin.pm new file mode 100644 index 0000000..5900958 --- /dev/null +++ b/Assembler/MaxAs/blib/lib/MaxAs/Cubin.pm @@ -0,0 +1,684 @@ +package MaxAs::Cubin; + +use strict; +use Data::Dumper; + +my @Elf32_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + L entry + L phOffset + L shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf64_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + Q entry + Q phOffset + Q shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf32_PrgHdr = qw( + L type + L offset + L vaddr + L paddr + L fileSize + L memSize + L flags + L align +); +my @Elf64_PrgHdr = qw( + L type + L flags + Q offset + Q vaddr + Q paddr + Q fileSize + Q memSize + Q align +); +my @Elf32_SecHdr = qw( + L name + L type + L flags + L addr + L offset + L size + L link + L info + L align + L entSize +); +my @Elf64_SecHdr = qw( + L name + L type + Q flags + Q addr + Q offset + Q size + L link + L info + Q align + Q entSize +); +my @Elf32_SymEnt = qw( + L name + L value + L size + C info + C 
other + S shIndx +); +my @Elf64_SymEnt = qw( + L name + C info + C other + S shIndx + Q value + Q size +); +my @symBind = qw(LOCAL GLOBAL WEAK); + +# Split the Elf Header defs into template strings (T) and corresponding hash keys columns (C) +my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC); + +$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr; +$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr; +$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr; +$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt; + +$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr; +$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr; +$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr; +$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt; + +$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr ]; +$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ]; +$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ]; +$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ]; + +$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr ]; +$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ]; +$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ]; +$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ]; + +# Load a cubin ELF file +sub new +{ + my ($package, $file) = @_; + + my $cubin = bless { fileName => $file }, $package; + + open my $fh, $file or die "$file: $!"; + binmode($fh); + + # Read in assuming 32 bit header + my $data; + read $fh, $data, 0x34; + my $elfHdr = $cubin->{elfHdr} = {}; + @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data; + + # 1: 32bit, 2: 64bit + my $class = $elfHdr->{fileClass}; + + # re-read in with 64 bit header if needed + if ($class == 2) + { + seek $fh, 0, 0; + read $fh, $data, 0x46; + @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data; + + $cubin->{Class} = 64; + } + else + { + $cubin->{Class} = 32; + } + + # verify sm_50 cubin + $cubin->{Arch} = 
$elfHdr->{flags} & 0xFF; + die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50; + + $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32; + + # Read in Program Headers + seek $fh, $elfHdr->{phOffset}, 0; + foreach (1 .. $elfHdr->{phNum}) + { + read $fh, $data, $elfHdr->{phEntSize}; + + my %prgHdr = (Indx => $_ - 1); + @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data; + push @{$cubin->{prgHdrs}}, \%prgHdr; + } + + # Read in Section Headers + seek $fh, $elfHdr->{shOffset}, 0; + foreach (1 .. $elfHdr->{shNum}) + { + read $fh, $data, $elfHdr->{shEntSize}; + + my %secHdr = (Indx => $_ - 1); + @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data; + push @{$cubin->{secHdrs}}, \%secHdr; + } + + # Read in Section data + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $data = ''; + # Skip sections with no data (type NULL or NOBITS) + if ($secHdr->{size} && $secHdr->{type} != 8) + { + seek $fh, $secHdr->{offset}, 0; + read $fh, $data, $secHdr->{size}; + } + # Convert string tables to maps + if ($secHdr->{type} == 3) # STRTAB + { + my $strTab = $secHdr->{StrTab} = {}; + my $indx = 0; + foreach my $str (split "\0", $data) + { + $strTab->{$indx} = $str; + $indx += 1 + length($str); + } + } + # Read in Symbol data + if ($secHdr->{type} == 2) # SYMTAB + { + my $offset = 0; + while ($offset < $secHdr->{size}) + { + my $symEnt = {}; + @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize}); + $offset += $secHdr->{entSize}; + + push @{$secHdr->{SymTab}}, $symEnt; + } + } + # Cache raw data for further processing and writing + $secHdr->{Data} = unpack 'H*', $data; + } + close $fh; + + # Update section headers with their names. Map names directly to headers. 
+ my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab}; + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $secHdr->{Name} = $shStrTab->{$secHdr->{name}}; + $cubin->{$secHdr->{Name}} = $secHdr; + } + + # Update symbols with their names + # For the Global functions, extract kernel meta data + # Populate the kernel hash + my $strTab = $cubin->{'.strtab'}{StrTab}; + foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}}) + { + $symEnt->{Name} = $strTab->{$symEnt->{name}}; + + # Attach symbol to section + my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}]; + $secHdr->{SymbolEnt} = $symEnt; + + # Look for symbols tagged FUNC + if (($symEnt->{info} & 0x0f) == 0x02) + { + # Create a hash of kernels for output + my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr; + + # Extract local/global/weak binding info + $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4]; + + # Extract the kernel instructions + $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ]; + + # Extract the max barrier resource identifier used and add 1. Should be 0-16. + # If a register is used as a barrier resource id, then this value is the max of 16. + $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20; + + # Extract the number of allocated registers for this kernel. + $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24; + + # Extract the size of shared memory this kernel uses. + my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"}; + $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0; + + # Attach constant0 section + $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"}; + + # Extract the kernel parameter data. 
+ my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"}; + if ($paramSec) + { + # Extract raw param data + my @data = unpack "L*", pack "H*", $paramSec->{Data}; + + $paramSec->{ParamData} = \@data; + $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ]; + + # Find the first param delimiter + my $idx = 0; + $idx++ while $idx < @data && $data[$idx] != 0x00080a04; + + my $first = $data[$idx+2] & 0xFFFF; + #my $size = $data[$idx+2] >> 16; + $idx += 4; + + my @params; + while ($idx < @data && $data[$idx] == 0x000c1704) + { + # Get the ordinal, offset, size and pointer alignment for each param + my $ord = $data[$idx+2] & 0xFFFF; + my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16); + my $psize = $data[$idx+3] >> 18; + my $align = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0; + unshift @params, "$ord:$offset:$psize:$align"; + $idx += 4; + } + my @staticParams = @data[0 .. ($idx-1)]; + + my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize); + while ($idx < @data) + { + my $code = $data[$idx] & 0xffff; + my $size = $data[$idx] >> 16; + $idx++; + + # EIATTR_MAXREG_COUNT + if ($code == 0x1b03) + { + $maxregCount = $size; + } + # EIATTR_S2RCTAID_INSTR_OFFSETS + elsif ($code == 0x1d04) + { + while ($size > 0) + { + push @ctaidOffsets, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_EXIT_INSTR_OFFSETS + elsif ($code == 0x1c04) + { + while ($size > 0) + { + push @exitOffsets, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_CTAIDZ_USED + elsif ($code == 0x0401) + { + $ctaidzUsed = 1; + } + # EIATTR_REQNTID + elsif ($code == 0x1004) + { + while ($size > 0) + { + push @reqntid, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_MAX_THREADS + elsif ($code == 0x0504) + { + while ($size > 0) + { + push @maxntid, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_CRS_STACK_SIZE + elsif ($code == 0x1e04) + { + while ($size > 0) + { + push @stackSize, $data[$idx++]; + $size -= 4; + } + } + else + { + 
printf STDERR "Unknown Code 0x%02x (size:%d)\n", $code, $size; + } + } + $kernelSec->{Params} = \@params; + $kernelSec->{ParamCnt} = scalar @params; + + $paramSec->{StaticParams} = \@staticParams; + $paramSec->{MAXREG_COUNT} = $maxregCount; + $paramSec->{ExitOffsets} = \@exitOffsets; + $paramSec->{CTAIDOffsets} = \@ctaidOffsets; + $paramSec->{CTAIDZUsed} = $ctaidzUsed; + $paramSec->{REQNTID} = \@reqntid; + $paramSec->{MAXNTID} = \@maxntid; + $paramSec->{STACKSIZE} = \@stackSize; + } + # print Dumper($paramSec); + # exit(); + } + # Note GLOBALs found in this cubin + elsif (($symEnt->{info} & 0x10) == 0x10) + { + $cubin->{Symbols}{$symEnt->{Name}} = $symEnt; + } + } + + # print "phOffset: $elfHdr->{phOffset}\n"; + # print "shOffset: $elfHdr->{shOffset}\n"; + # foreach my $secHdr (@{$cubin->{secHdrs}}) + # { + # print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n"; + # } + # my $p = 0; + # foreach my $prgHdr (@{$cubin->{prgHdrs}}) + # { + # print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n"; + # $p++; + # } + # exit(); + + # print Dumper($cubin->{prgHdrs}); + # exit(); + return $cubin; +} +sub class +{ + return shift()->{Class}; +} +sub arch +{ + return shift()->{Arch}; +} +sub address_size +{ + return shift()->{AddressSize}; +} +sub listKernels +{ + return shift()->{Kernels}; +} +sub listSymbols +{ + return shift()->{Symbols}; +} +sub getKernel +{ + my ($cubin, $kernel) = @_; + return $cubin->{Kernels}{$kernel}; +} + +sub modifyKernel +{ + my ($cubin, %params) = @_; + + my $kernelSec = $params{Kernel}; + my $newReg = $params{RegCnt}; + my $newBar = $params{BarCnt}; + my $exitOffsets = $params{ExitOffsets}; + my $ctaidOffsets = $params{CTAIDOffsets}; + my $ctaidzUsed = $params{CTAIDZUsed}; + my $newData = $params{KernelData}; + my $newSize = @$newData * 8; + + die "255 register max" if $newReg > 255; + die "new 
kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63; + die "16 is max barrier count" if $newBar > 16; + + my $paramSec = $kernelSec->{ParamSec}; + my $kernelName = $kernelSec->{SymbolEnt}{Name}; + my $maxregCount = $paramSec->{MAXREG_COUNT}; + my $stackSize = $paramSec->{STACKSIZE}; + + # update the kernel + $kernelSec->{KernelData} = $newData; + $kernelSec->{Data} = unpack "H*", pack "Q*", @$newData; + + if ($newReg != $kernelSec->{RegCnt}) + { + print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n"; + $kernelSec->{RegCnt} = $newReg; + $kernelSec->{info} &= ~0xff000000; + $kernelSec->{info} |= $newReg << 24; + } + if ($newBar != $kernelSec->{BarCnt}) + { + print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n"; + $kernelSec->{BarCnt} = $newBar; + $kernelSec->{flags} &= ~0x01f00000; + $kernelSec->{flags} |= $newBar << 20; + } + + my @paramData = @{$paramSec->{StaticParams}}; + + if (defined $maxregCount) + { + push @paramData, ($maxregCount << 16) | 0x1b03; + } + + my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets; + my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}}; + + if ($newCTAIDs ne $oldCTAIDs) + { + print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"; + } + if (@$ctaidOffsets) + { + push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04; + push @paramData, @$ctaidOffsets; + } + + my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets; + my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}}; + + if ($newExits ne $oldExits) + { + print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"; + } + if (@$exitOffsets) + { + push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04; + push @paramData, @$exitOffsets; + } + + if ($ctaidzUsed != $paramSec->{CTAIDZUsed}) + { + print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"; + } + if ($ctaidzUsed) + { + 
push @paramData, 0x0401; + } + + if (@{$paramSec->{REQNTID}}) + { + push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004; + push @paramData, @{$paramSec->{REQNTID}}; + } + if (@{$paramSec->{MAXNTID}}) + { + push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504; + push @paramData, @{$paramSec->{MAXNTID}}; + } + + if (@$stackSize) + { + push @paramData, (scalar(@$stackSize) << 18) | 0x1e04; + push @paramData, @$stackSize; + } + + my $newParamSize = scalar(@paramData)*4; + $paramSec->{Data} = unpack "H*", pack "L*", @paramData; + if ($newParamSize != $paramSec->{size}) + { + print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n"; + $cubin->updateSize($paramSec, $newParamSize); + } + + if ($newSize != $kernelSec->{size}) + { + print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n"; + $cubin->updateSize($kernelSec, $newSize, 1); + } +} + +sub updateSize +{ + my ($cubin, $sec, $newSize, $updatePrgSize) = @_; + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + # update section header + my $delta = $newSize - $sec->{size}; + $sec->{size} = $newSize; + + # update symtab section + if ($sec->{SymbolEnt}) + { + $sec->{SymbolEnt}{size} = $newSize; + my $symSection = $cubin->{'.symtab'}; + $symSection->{Data} = ''; + foreach my $symEnt (@{$symSection->{SymTab}}) + { + $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}}; + } + } + + my $pos = $elfHdr->{ehSize}; + my %sizeMap; + + # update section header offsets + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + # skip first header + next if $secHdr->{align} == 0; + + # NOBITS data sections are size 0 + my $size = $secHdr->{type} == 8 ? 
0 : $secHdr->{size}; + + # Add any needed padding between sections + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pos += $secHdr->{align} - $pad; + } + # map old offset to new + $sizeMap{$secHdr->{offset}} = $pos; + + # update offset + $secHdr->{offset} = $pos; + + # advance position by size + $pos += $size; + } + + # compute total section header size + my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset}; + + # map old offset to new + $sizeMap{$elfHdr->{shOffset}} = $pos; + $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize; + + $elfHdr->{shOffset} = $pos; + $elfHdr->{phOffset} = $pos + $shSize; + + # update program header offsets and sizes + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + # Not sure how best to adjust these so just assume they'll track other offsets. + $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}}; + + # If the kernel sizes changes, also update the associated ProgramHeader. + # Note that this size is the kernel size plus any constant section sizes. + if ($updatePrgSize && $prgHdr->{type} == 1 && + $sec->{offset} >= $prgHdr->{offset} && + $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta) + { + $prgHdr->{fileSize} += $delta; + $prgHdr->{memSize} += $delta; + } + } +} + +# Write out the cubin after modifying it. 
+sub write +{ + my ($cubin, $file) = @_; + + open my $fh, ">$file" or die "Error: could not open $file for writing: $!"; + binmode($fh); + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + # write elf header + print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}}; + my $pos = $elfHdr->{ehSize}; + + # write section data + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + # Skip NULL and NOBITS data sections + next if $secHdr->{size} == 0 || $secHdr->{type} == 8; + + # Add any needed padding between sections + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pad = $secHdr->{align} - $pad; + print $fh join '', "\0" x $pad; + $pos += $pad; + } + + print $fh pack 'H*', $secHdr->{Data}; + $pos += $secHdr->{size}; + } + + # write section headers + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}}; + } + + #write program headers + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}}; + } + close $fh; +} + +__END__ + diff --git a/Assembler/MaxAs/blib/lib/MaxAs/MaxAs.pm b/Assembler/MaxAs/blib/lib/MaxAs/MaxAs.pm new file mode 100644 index 0000000..f421cf3 --- /dev/null +++ b/Assembler/MaxAs/blib/lib/MaxAs/MaxAs.pm @@ -0,0 +1,1407 @@ +package MaxAs::MaxAs; + +require 5.10.0; + +use strict; +use Data::Dumper; +use MaxAs::MaxAsGrammar; +use File::Spec; +use Carp; + +our $VERSION = '1.06'; + +# these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump +my %relOffset = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT); + +# these ops use absolute addresses +my %absOffset = map { $_ => 1 } qw(JCAL); + +my %jumpOp = (%relOffset, %absOffset); + +# These instructions use r0 but do not write to r0 +my %noDest = map { $_ => 1 } qw(ST STG STS STL RED); + +# Map register slots to reuse control codes +my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4); + +# Preprocess and Assemble a source file +sub 
Assemble +{ + my ($file, $include, $doReuse, $nowarn) = @_; + + my $regMap = {}; + $file = Preprocess($file, $include, 0, $regMap); + my $vectors = delete $regMap->{__vectors}; + my $regBank = delete $regMap->{__regbank}; + + # initialize cubin counts + my $regCnt = 0; + my $barCnt = 0; + + my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse); + + # initialize the first control instruction + push @instructs, $ctrl = {}; + + foreach my $line (split "\n", $file) + { + # keep track of line nums in the physical file + $lineNum++; + + next unless preProcessLine($line); + + # match an instruction + if (my $inst = processAsmLine($line, $lineNum)) + { + # Save us from crashing the display driver + die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n" + if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0; + + # track branches/jumps/calls/etc for label remapping + push @branches, @instructs+0 if exists $jumpOp{$inst->{op}}; + + # push the control code onto the control instruction + push @{$ctrl->{ctrl}}, $inst->{ctrl}; + + # now point the instruction to its associated control instruction + $inst->{ctrl} = $ctrl; + + # add the op name and full instruction text + push @instructs, $inst; + + # add a 4th control instruction for every 3 instructions + push @instructs, $ctrl = {} if ((@instructs & 3) == 0); + } + # match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + # map the label name to the index of the instruction about to be inserted + $labels{$1} = @instructs+0; + } + else + { + die "badly formed line at $lineNum: $line\n"; + } + } + # add the final BRA op and align the number of instructions to a multiple of 8 + push @{$ctrl->{ctrl}}, 0x007ff; + push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' }; + while (@instructs & 7) + { + push @instructs, $ctrl = {} if ((@instructs & 3) == 0); + push @{$ctrl->{ctrl}}, 0x007e0; + push @instructs, { op => 'NOP', inst => 
'NOP;' }; + } + + # remap labels + foreach my $i (@branches) + { + if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1}) + { die "instruction has invalid label: $instructs[$i]{inst}"; } + + $instructs[$i]{jump} = $labels{$1}; + + if (exists $relOffset{$instructs[$i]{op}}) + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; } + else + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; } + } + + # calculate optimal register reuse + # This effects register bank decisions so do it before analyzing register use + foreach my $i (0 .. $#instructs) + { + #skip control instructions + next unless $i & 3; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + # Apply the rule pattern + my $capData = parseInstruct($inst, $gram) or next; + + if ($doReuse) + { + # get any vector registers for r0 + my @r0 = getVecRegisters($vectors, $capData); + + # There are 2 reuse slots per register slot + # The reuse hash points to most recent instruction index where register was last used in this slot + + # For writes to a register, clear any reuse opportunity + if (@r0 && !exists $noDest{$op}) + { + foreach my $slot (keys %reuseSlots) + { + if (my $reuse = $reuse{$slot}) + { + # if writing with a vector op, clear all linked registers + delete $reuse->{$_} foreach @r0; + } + } + } + # clear cache if jumping elsewhere + %reuse = () if exists $jumpOp{$op}; + + # only track register reuse for instruction types this works with + if ($gram->{type}{reuse}) + { + foreach my $slot (keys %reuseSlots) + { + next unless exists $capData->{$slot}; + + my $r = $capData->{$slot}; + next if $r eq 'RZ'; + next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction + + my $reuse = $reuse{$slot} ||= {}; + + # if this register was previously marked for potential reuse + if (my $p = $reuse->{$r}) + { + # flag the 
previous instruction's ctrl reuse array slot + $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot}; + + #print "reuse $slot $r $instructs[$p]{inst}\n"; + } + # list full, delete the oldest + elsif (keys %$reuse > 2) + { + my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0]; + delete $reuse->{$oldest}; + } + # mark the new instruction for potential reuse + $reuse->{$r} = $i; + } + } + } + # if reuse is disabled then pull value from code. + elsif ($gram->{type}{reuse}) + { + $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData); + } + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + # Assign registers to requested banks if possible + foreach my $r (sort keys %$regBank) + { + my $bank = $regBank->{$r}; + my $avail = $regMap->{$r}; + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 3)) + { + # assign it, while removing the assigned register from the pool + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + last; + } + } + } + + # calculate register live times and preferred banks for non-fixed registers. + # LiveTime only half implemented... + my (%liveTime, %pairedBanks, %reuseHistory); + foreach my $i (0 .. $#instructs) + { + #skip control instructions + next unless $i & 3; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + # Apply the rule pattern + my $capData = parseInstruct($inst, $gram) or next; + my $reuseType = $gram->{type}{reuse}; + + # liveTimes and bank conflicts with source operands + my (%addReuse, %delReuse); + foreach my $slot (qw(r8 r20 r39)) + { + my $r = $capData->{$slot} or next; + next if $r eq 'RZ'; + + my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r}; + + # All registers should be written prior to being read.. 
+ if (my $liveTime = $liveTime{$liveR}) + { + # for each read set the current instruction index as the high value + $liveTime->[$#$liveTime][1] = $i; + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + + # Is this register active in the reuse cache? + my $slotHist = $reuseHistory{$slot} ||= {}; + my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0; + + #print "IADD3-1: $slot:$r (!$selfReuse && $regMap->{$r})\n" if $op eq 'IADD3'; + + # If this is an auto reg, look at the open banks. + # No need to look at banks if this register is in the reuse cache. + if (!$selfReuse && ref $regMap->{$r}) + { + # Look at other source operands in this instruction and flag what banks are being used + foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39)) + { + my $r2 = $capData->{$slot2}; + next if $r2 eq 'RZ' || $r2 eq $r; + + my $slotHist2 = $reuseHistory{$slot2} ||= {}; + + #print "IADD3-2: $slot:$r $slot2:$r2 (!$reuseType && !$slotHist2->{$r2})\n" if $op eq 'IADD3'; + + # Dont be concerned with non-reuse type instructions or + # If this operand is in the reuse cache, we don't care what bank it's on. + if (!$reuseType || !exists $slotHist2->{$r2}) + { + # if the operand is also an auto-allocated register then link them + # Once we choose the bank for one we want to update that choice for the other register. + if (ref $regMap->{$r2}) + { + push @{$pairedBanks{$r}{pairs}}, $r2; + $pairedBanks{$r}{banks} ||= []; + } + # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid. + else + { + my $bank = substr($regMap->{$r2},1) & 3; + #print "IADD3-3: $r2:$bank\n" if $op eq 'IADD3'; + + $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++; + $pairedBanks{$r}{pairs} ||= []; + } + # Update the total use count for this register. 
+ # This will be the number of times the register is pulled out of the bank. + $pairedBanks{$r}{useCnt}++; + } + } + } + # update the reuse history so we know which bank conflicts we can ignore. + if ($reuseType) + { + # flag these slots for addition or removal from reuseHistory + if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot}) + { $addReuse{$slot} = $r; } + else + { $delReuse{$slot} = $r; } + } + } + # update reuse history after we're done with the instruction (when the flag is actually in effect). + # we don't want to updated it in the middle since that can interfere with the checks, + $reuseHistory{$_}{$addReuse{$_}} = 1 foreach keys %addReuse; + delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse; + + # liveTimes for destination operands and vector registers + foreach my $r0 (getVecRegisters($vectors, $capData)) + { + # fixed register mappings can have aliases so use the actual register value for those. + my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0}; + + # If not writing treat just like a read + if (exists $noDest{$op}) + { + if (my $liveTime = $liveTime{$liveR}) + { + $liveTime->[$#$liveTime][1] = $i; + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r0): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + } + # If writing, push a new bracket on this register's stack. + elsif (my $liveTime = $liveTime{$liveR}) + { + if ($i > $liveTime->[$#$liveTime][1]) + { + push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + else + { + # Initialize the liveTime stack for this register. 
+ push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + #print Dumper(\%liveTime); exit(1); + + # assign unassigned registers + # sort by most restricted, then most used, then name + foreach my $r (sort { + $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} || + $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} || + $a cmp $b + } keys %pairedBanks) + { + my $banks = $pairedBanks{$r}{banks}; + my $avail = $regMap->{$r}; + + #printf "%10s: (%d,%d) %d,%d,%d,%d, %s\n", $r, $pairedBanks{$r}{bnkCnt}, $pairedBanks{$r}{useCnt}, @{$banks}[0,1,2,3], join ',', @$avail; + + # Pick a bank with zero or the smallest number of conflicts + BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3)) + { + # pick an available register that matches the requested bank + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 3)) + { + # assign it, while removing the assigned register from the pool + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + + # update bank info for any unassigned pair + $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}}; + last BANK; + } + } + } + } + # Now assign any remaining to first available + foreach my $r (sort keys %$regMap) + { + if (ref($regMap->{$r}) eq 'ARRAY') + { + $regMap->{$r} = 'R' . shift @{$regMap->{$r}}; + } + } + #print map "$regMap->{$_}: $_\n", sort { substr($regMap->{$a},1) <=> substr($regMap->{$b},1) } keys %$regMap; + + # apply the register mapping and assemble the instructions to op codes + foreach my $i (0 .. $#instructs) + { + #skip control instructions + next unless $i & 3; + + # save the original and replace the register names with numbers + $instructs[$i]{orig} = $instructs[$i]{inst}; + $instructs[$i]{inst} =~ s/(?{$1}) ? 
$regMap->{$1} : $1 /ge; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + # Apply the rule pattern + my $capData = parseInstruct($inst, $gram) or next; + + # update the register count + foreach my $r (qw(r0 r8 r20 r39)) + { + next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ'; + + # get numeric portion of regname + my $val = substr $capData->{$r}, 1; + + my @r0 = getVecRegisters($vectors, $capData); + my @r8 = getAddrVecRegisters($vectors, $capData); + + # smart enough to count vector registers for memory instructions. + my $regInc = $r eq 'r0' ? scalar(@r0) || 1 : 1; + my $regInc = $r eq 'r8' ? scalar(@r8) || 1 : 1; + + if ($val + $regInc > $regCnt) + { + $regCnt = $val + $regInc; + #print "$val $regCnt $regInc\n"; + } + } + # update the barrier resource count + if ($op eq 'BAR') + { + if (exists $capData->{i8w4}) + { + $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt; + } + # if a barrier value is a register, assume the maximum + elsif (exists $capData->{r8}) + { + $barCnt = 16; + } + } + # Generate the op code. + my ($code, $reuse) = genCode($op, $gram, $capData); + $instructs[$i]{code} = $code; + + # cache this for final pass when we want to calculate reuse stats. + if ($gram->{type}{reuse}) + { $instructs[$i]{caps} = $capData; } + # use the parsed value of reuse for non-reuse type instructions + else + { $ctrl->{reuse}[($i & 3) - 1] = $reuse; } + + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + # final pass to piece together control codes + my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed); + foreach my $i (0 .. 
$#instructs) + { + # op code + if ($i & 3) + { + push @codes, $instructs[$i]{code}; + + if ($instructs[$i]{caps}) + { + # calculate stats on registers + registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn); + } + if ($instructs[$i]{inst} =~ m'EXIT') + { + push @exitOffsets, (scalar(@codes)-1)*8; + } + elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)') + { + push @ctaidOffsets, (scalar(@codes)-1)*8; + $ctaidzUsed = 1 if $1 eq 'Z'; + } + } + # control code + else + { + my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)}; + push @codes, + ($ctrl->[0] << 0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes + ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59); # reuse codes + } + } + + # return the kernel data + return { + RegCnt => $regCnt, + BarCnt => $barCnt, + ExitOffsets => \@exitOffsets, + CTAIDOffsets => \@ctaidOffsets, + CTAIDZUsed => $ctaidzUsed, + ConflictCnt => $reuseHistory{conflicts}, + ReuseCnt => $reuseHistory{reuse}, + ReuseTot => $reuseHistory{total}, + ReusePct => ($reuseHistory{total} ? 
100 * $reuseHistory{reuse} / $reuseHistory{total} : 0), + KernelData => \@codes, + }; +} + +# Useful for testing op code coverage of existing code, extracting new codes and flags +sub Test +{ + my ($fh, $printConflicts, $all) = @_; + + my @instructs; + my %reuseHistory; + my ($pass, $fail) = (0,0); + + while (my $line = <$fh>) + { + my (@ctrl, @reuse); + + next unless processSassCtrlLine($line, \@ctrl, \@reuse); + + foreach my $fileReuse (@reuse) + { + $line = <$fh>; + + my $inst = processSassLine($line) or next; + + $inst->{reuse} = $fileReuse; + my $fileCode = $inst->{code}; + + if (exists $relOffset{$inst->{op}}) + { + # these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump + $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e; + } + + my $match = 0; + foreach my $gram (@{$grammar{$inst->{op}}}) + { + my $capData = parseInstruct($inst->{inst}, $gram) or next; + my @caps; + + # Run in test mode to list what capture groups were captured + my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps); + + # Detect register bank conflicts but only for reuse type instructions. + # If a bank conflict is avoided by a reuse flag then ignore it. + registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? 
$inst->{inst} : '') if $gram->{type}{reuse}; + + $inst->{caps} = join ', ', sort @caps; + $inst->{codeDiff} = $fileCode ^ $code; + $inst->{reuseDiff} = $fileReuse ^ $reuse; + + # compare calculated and file values + if ($code == $fileCode && $reuse == $fileReuse) + { + $inst->{grade} = 'PASS'; + push @instructs, $inst if $all; + $pass++; + } + else + { + $inst->{grade} = 'FAIL'; + push @instructs, $inst; + $fail++; + } + $match = 1; + last; + } + unless ($match) + { + $inst->{grade} = 'FAIL'; + $inst->{codeDiff} = $fileCode; + $inst->{reuseDiff} = $fileReuse; + push @instructs, $inst; + $fail++; + } + } + } + my %maxLen; + foreach (@instructs) + { + $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}}; + } + my ($lastOp, $template); + foreach my $inst (sort { + $a->{op} cmp $b->{op} || + $a->{codeDiff} <=> $b->{codeDiff} || + $a->{reuseDiff} <=> $b->{reuseDiff} || + $a->{ins} cmp $b->{ins} + } @instructs) + { + if ($lastOp ne $inst->{op}) + { + $lastOp = $inst->{op}; + $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s %s\n"; + printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures); + } + printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)}; + } + my $reusePct = $reuseHistory{total} ? 
100 * $reuseHistory{reuse} / $reuseHistory{total} : 0; + + printf "\nRegister Bank Conflicts: %d, Reuse: %.1f% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n", + $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total}; + + return $fail; +} + +# Convert cuobjdump sass to the working format +sub Extract +{ + my ($in, $out, $params) = @_; + + my %paramMap; + my %constants = + ( + blockDimX => 'c[0x0][0x8]', + blockDimY => 'c[0x0][0xc]', + blockDimZ => 'c[0x0][0x10]', + gridDimX => 'c[0x0][0x14]', + gridDimY => 'c[0x0][0x18]', + gridDimZ => 'c[0x0][0x1c]', + ); + print $out "\n"; + + foreach my $const (sort keys %constants) + { + print $out " $const : $constants{$const}\n"; + $paramMap{$constants{$const}} = $const; + } + print $out "\n"; + + foreach my $p (@$params) + { + my ($ord,$offset,$size,$align) = split ':', $p; + + if ($size > 4) + { + my $num = 0; + $offset = hex $offset; + while ($size > 0) + { + my $param = sprintf 'param_%d[%d]', $ord, $num; + my $const = sprintf 'c[0x0][0x%x]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + $size -= 4; + $offset += 4; + $num += 1; + } + } + else + { + my $param = sprintf 'param_%d', $ord; + my $const = sprintf 'c[0x0][%s]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + } + } + print $out "\n\n"; + + my %labels; + my $labelnum = 1; + + my @data; + FILE: while (my $line = <$in>) + { + my (@ctrl, @ruse); + next unless processSassCtrlLine($line, \@ctrl, \@ruse); + + CTRL: foreach my $ctrl (@ctrl) + { + $line = <$in>; + + my $inst = processSassLine($line) or next CTRL; + + # Convert branch/jump/call addresses to labels + if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)') + { + my $target = hex($1); + + # skip the final BRA and stop processing the file + last FILE if $inst->{op} eq 'BRA' && ($target == $inst->{num} || $target == $inst->{num}-8); + + # check to see if we've already generated a label for this target 
address + my $label = $labels{$target}; + unless ($label) + { + # generate a label name and cache it + $label = $labels{$target} = "TARGET$labelnum"; + $labelnum++; + } + # replace address with name + $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/; + } + $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg; + + $inst->{ctrl} = printCtrl($ctrl); + + push @data, $inst; + } + } + # make a second pass now that we have the complete instruction address to label mapping + foreach my $inst (@data) + { + print $out "$labels{$inst->{num}}:\n" if exists $labels{$inst->{num}}; + printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)}; + } +} + +my $CommentRe = qr'^[\t ]*.*?^\s*\n?'ms; +my $IncludeRe = qr'^[\t ]*\n?'ms; +my $CodeRe = qr'^[\t ]*(.*?)^\s*<\/CODE\1>\n?'ms; +my $ConstMapRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $RegMapRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $ScheduleRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $InlineRe = qr'\[(\+|\-)(.+?)\1\]'ms; + +sub IncludeFile +{ + my ($file, $include) = @_; + my ($vol,$dir,$name) = File::Spec->splitpath($file); + local $/; + my $fh; + if (!open $fh, $file) + { + open $fh, File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n"; + } + my $content = <$fh>; + close $fh; + return $content; +} + +sub Preprocess +{ + my ($file, $include, $debug, $regMap) = @_; + + my $constMap = {}; + my $removeRegMap; + if ($regMap) + { $removeRegMap = 1; } + else + { $regMap = {}; } + + # include nested files + 1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg; + + # Strip out comments + $file =~ s|$CommentRe||g; + + # Execute the CODE sections (old way to run code, to be deprecated) + 1 while $file =~ s|$CodeRe| + my $out = eval "package MaxAs::MaxAs::CODE; $2"; + $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg; + + # Execute the inline code (new way) + $file =~ s|$InlineRe| + my ($type, $code) = ($1, $2); + my $out = eval "package MaxAs::MaxAs::CODE; $code"; + $@ ? 
die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg; + + #Pull in the constMap + $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg; + + my @newFile; + foreach my $line (split "\n", $file) + { + # skip comments + if ($line !~ m'^\s*(?:#|//).*') + { + $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg; + } + push @newFile, $line; + } + $file = join "\n", @newFile; + + # Pull in the reg map first as the Scheduler will need it to handle vector instructions + # Remove the regmap if we're going on to assemble + $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg; + + # Pick out the SCHEDULE_BLOCK sections + my @schedBlocks = $file =~ /$ScheduleRe/g; + + # Schedule them + foreach my $i (0 .. $#schedBlocks) + { + # XMAD macros should only appear in SCHEDULE_BLOCKs + $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]); + + $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug); + } + + # Replace the results + $file =~ s|$ScheduleRe| shift @schedBlocks |eg; + + return $file; +} + +# break the registers down into source and destination categories for the scheduler +my %srcReg = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X); +my %destReg = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC); +my %regops = (%srcReg, %destReg); +my @itypes = qw(class lat rlat tput dual); + +sub Scheduler +{ + my ($block, $blockNum, $regMap, $debug) = @_; + + my $vectors = $regMap->{__vectors}; + my $lineNum = 0; + + my (@instructs, @comments, $ordered, $first); + foreach my $line (split "\n", $block) + { + # keep track of line nums in the physical file + $lineNum++; + + unless (preProcessLine($line)) + { + push @comments, $line if $line =~ m'\S'; + next; + } + + # match an instruction + if (my $inst = processAsmLine($line, $lineNum)) + { + # if the first instruction in the block is waiting on a dep, it should go first. + $inst->{first} = !$first++ && ($inst->{ctrl} & 0x1f800) ? 
0 : 1; + + # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block) + #$inst->{first} = $inst->{ctrl} & 0x0000f ? 1 : 2; + $inst->{exeTime} = 0; + $inst->{order} = $ordered++ if $ordered; + push @instructs, $inst; + } + # match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n"; + } + # open an ORDERED block + elsif ($line =~ m'^') + { + die "you cannot use nested tags" if $ordered; + $ordered = 1; + } + # close an ORDERED block + elsif ($line =~ m'^') + { + die "missing opening for closing tag" if !$ordered; + $ordered = 0; + } + else + { + die "badly formed line at block: $blockNum line: $lineNum: $line\n"; + } + } + + my (%writes, %reads, @ready, @schedule, $orderedParent); + # assemble the instructions to op codes + foreach my $instruct (@instructs) + { + my $match = 0; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $capData = parseInstruct($instruct->{inst}, $gram) or next; + my (@dest, @src); + + # copy over instruction types for easier access + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + + # A predicate prefix is treated as a source reg + push @src, $instruct->{predReg} if $instruct->{pred}; + + # Handle P2R and R2P specially + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + my $list = $instruct->{op} eq 'R2P' ? 
\@dest : \@src; + my $mask = hex($capData->{i20w7}); + foreach my $p (0..6) + { + if ($mask & (1 << $p)) + { + push @$list, "P$p"; + } + # make this instruction dependent on any predicates it's not setting + # this is to prevent a race condition for any predicate sets that are pending + elsif ($instruct->{op} eq 'R2P') + { + push @src, "P$p"; + } + } + # These instructions can't be dual issued + $instruct->{nodual} = 1; + } + + # Populate our register source and destination lists, skipping any zero or true values + foreach my $operand (grep { exists $regops{$_} } sort keys %$capData) + { + # figure out which list to populate + my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src; + + # Filter out RZ and PT + my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT'; + + if ($capData->{$operand} ne $badVal) + { + # add the value to list with the correct prefix + push @$list, + $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) : + $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) : + $operand eq 'CC' ? 'CC' : + $operand eq 'X' ? 'CC' : + getRegNum($regMap, $capData->{$operand}); + } + } + $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39}); + + # Find Read-After-Write dependencies + foreach my $src (grep { exists $writes{$_} } @src) + { + # Memory operations get delayed access to registers but not to the predicate + my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat}; + + # the parent should be the most recently added dest op to the stack + foreach my $parent (@{$writes{$src}}) + { + # add this instruction as a child of the parent + # set the edge to the total latency of reg source availability + #print "R $parent->{inst}\n\t\t$instruct->{inst}\n"; + my $latency = $src =~ m'^P\d' ? 
13 : $parent->{lat}; + push @{$parent->{children}}, [$instruct, $latency - $regLatency]; + $instruct->{parents}++; + + # if the destination was conditionally executed, we also need to keep going back till it wasn't + last unless $parent->{pred}; + } + } + + # Find Write-After-Read dependencies + foreach my $dest (grep { exists $reads{$_} } @dest) + { + # Flag this instruction as dependent to any previous read + foreach my $reader (@{$reads{$dest}}) + { + # no need to stall for these types of dependencies + #print "W $reader->{inst} \t\t\t $instruct->{inst}\n"; + push @{$reader->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + # Once dependence is marked we can clear out the read list (unless this write was conditional). + # The assumption here is that you would never want to write out a register without + # subsequently reading it in some way prior to writing it again. + delete $reads{$dest} unless $instruct->{pred}; + } + + # Enforce instruction ordering where requested + if ($instruct->{order}) + { + if ($orderedParent) + { + push @{$orderedParent->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + $orderedParent = $instruct; + } + elsif ($orderedParent) + { $orderedParent = 0; } + + # For a dest reg, push it onto the write stack + unshift @{$writes{$_}}, $instruct foreach @dest; + + # For a src reg, push it into the read list + push @{$reads{$_}}, $instruct foreach @src; + + # if this instruction has no dependencies it's ready to go + push @ready, $instruct if !exists $instruct->{parents}; + + $match = 1; + last; + } + die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match; + } + %writes = (); + %reads = (); + + if (@ready) + { + # update dependent counts for sorting hueristic + my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" }; + + countUniqueDescendants($readyParent, {}); + updateDepCounts($readyParent, {}); + + # sort the initial ready list + @ready = sort 
{ + $a->{first} <=> $b->{first} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + # Process the ready list, adding new instructions to the list as we go. + my $clock = 0; + while (my $instruct = shift @ready) + { + my $stall = $instruct->{stall}; + + # apply the stall to the previous instruction + if (@schedule && $stall < 16) + { + my $prev = $schedule[$#schedule]; + + # if stall is greater than 4 then also yield + # the yield flag is required to get stall counts 12-15 working correctly. + $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0; + $prev->{ctrl} |= $stall; + $clock += $stall; + } + # For stalls bigger than 15 we assume the user is managing it with a barrier + else + { + $instruct->{ctrl} &= 0x1fff0; + $instruct->{ctrl} |= 1; + $clock += 1; + } + print "$clock: $instruct->{inst}\n" if $debug; + + # add a new instruction to the schedule + push @schedule, $instruct; + + # update each child with a new earliest execution time + if (my $children = $instruct->{children}) + { + foreach (@$children) + { + my ($child, $latency) = @$_; + + # update the earliest clock value this child can safely execute + my $earliest = $clock + $latency; + $child->{exeTime} = $earliest if $child->{exeTime} < $earliest; + + print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug; + + # decrement parent count and add to ready queue if none remaining. + push @ready, $child if --$child->{parents} < 1; + } + delete $instruct->{children}; + } + + # update stall and mix values in the ready queue on each iteration + foreach my $ready (@ready) + { + # calculate how many instructions this would cause the just added instruction to stall. 
+ $stall = $ready->{exeTime} - $clock; + $stall = 1 if $stall < 1; + + # if using the same compute resource as the prior instruction then limit the throughput + if ($ready->{class} eq $instruct->{class}) + { + $stall = $ready->{tput} if $stall < $ready->{tput}; + } + # dual issue with a simple instruction (tput <= 2) + # can't dual issue two instructions that both load a constant + elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} && + $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const})) + { + $stall = 0; + } + $ready->{stall} = $stall; + + # add an instruction class mixing huristic that catches anything not handled by the stall + $ready->{mix} = $ready->{class} ne $instruct->{class} || 0; + } + + # sort the ready list by stall time, mixing huristic, dependencies and line number + @ready = sort { + $a->{first} <=> $b->{first} || + $a->{stall} <=> $b->{stall} || + $b->{mix} <=> $a->{mix} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(f exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + my $out; + #$out .= "$_\n" foreach @comments; + $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule; + return $out; +} + +sub setConstMap +{ + my ($constMap, $constMapText) = @_; + + foreach my $line (split "\n", $constMapText) + { + # strip leading space + $line =~ s|^\s+||; + # strip comments + $line =~ s{(?:#|//).*}{}; + # strip trailing space + $line =~ s|\s+$||; + # skip blank lines + next unless $line =~ m'\S'; + + my ($name, $value) = split '\s*:\s*', $line; + + $constMap->{$name} = $value; + } + return; +} + +sub setRegisterMap +{ + my ($regMap, $regmapText) = @_; + + my $vectors = $regMap->{__vectors} ||= {}; + my $regBank = $regMap->{__regbank} ||= {}; + my %aliases; + + foreach my $line (split 
"\n", $regmapText) + { + # strip leading space + $line =~ s|^\s+||; + # strip comments + $line =~ s{(?:#|//).*}{}; + # strip trailing space + $line =~ s|\s+$||; + # skip blank lines + next unless $line =~ m'\S'; + + my $auto = $line =~ /~/; + my $share = $line =~ /=/; + + my ($regNums, $regNames) = split '\s*[:~=]\s*', $line; + + my (@numList, @nameList, %vecAliases); + foreach my $num (split '\s*,\s*', $regNums) + { + my ($start, $stop) = split '\s*\-\s*', $num; + die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop; + push @numList, ($start .. $stop||$start); + } + foreach my $fullName (split '\s*,\s*', $regNames) + { + if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$') + { + my ($name1, $name2, $bank) = ($1, $3, $4); + foreach (split '\s*\|\s*', $2) + { + my ($start, $stop) = split '\s*\-\s*'; + foreach my $r (map "$name1$_$name2", $start .. $stop||$start) + { + # define an alias for use in vector instructions that omits the number portion + $aliases{$r} = "$name1$name2" unless exists $aliases{$r}; + push @nameList, $r; + $regBank->{$r} = $bank if $auto && defined $bank; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank; + } + } + } + elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$') + { + push @nameList, $1; + $regBank->{$1} = $2 if $auto && defined $2; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2; + } + else + { + die "Bad register name: '$fullName' at: $line\n"; + } + } + die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList; + die "Missmatched register mapping at: $line\n" if $share && @numList > 1; + + # detect if this list is monotonically ascending with no gaps + my $i = 0; + while ($i < $#numList-1) + { + last if $numList[$i] + 1 != $numList[$i+1]; + $i++; + } + my $ascending = $i+1 == $#numList; + + foreach my $n 
(0..$#nameList) + { + die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]}; + + if ($auto) + { + # assign possible values to be assigned on assembly + $regMap->{$nameList[$n]} = \@numList; + } + elsif ($share) + { + # each name shares the same single register + $regMap->{$nameList[$n]} = 'R' . $numList[0]; + } + else + { + $regMap->{$nameList[$n]} = 'R' . $numList[$n]; + # flag any even register as a potential vector + if ($ascending && ($numList[$n] & 1) == 0) + { + # constrain potential range to vector alignment + my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3); + if ($end <= $#nameList) + { + $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ]; + #setup an alias for the base name without the number + if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}}) + { + $regMap->{$aliases{$nameList[$n]}} = $regMap->{$nameList[$n]}; + $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]}; + delete $aliases{$nameList[$n]}; + } + } + } + } + } + } + #print Dumper($regMap); exit(1); +} + +sub preProcessLine +{ + # strip leading space + $_[0] =~ s|^\s+||; + + # preserve comment but check for emptiness + my $val = shift; + + # strip comments + $val =~ s{(?:#|//).*}{}; + + # skip blank lines + return $val =~ m'\S'; +} + +# traverse the graph and count total descendants per node. +# only count unique nodes (by lineNum) +sub countUniqueDescendants +{ + my ($node, $edges) = @_; + + #warn "$node->{inst}\n"; + + if (my $children = $node->{children}) + { + foreach my $child (grep $_->[1], @$children) # skip WaR deps and traversed edges + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + + $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges); + } + } + else + { + return $node->{lineNum}; + } + return ($node->{lineNum}, keys %{$node->{deps}}); +} +# convert hash to count for easier sorting. 
# Collapse every node's {deps} entry into a plain number so the scheduler's
# sort comparators can use it directly.
#
# countUniqueDescendants() stores {deps} as a hash ref keyed by descendant
# lineNum on nodes that have counted children and leaves it unset otherwise;
# after this walk each visited node holds a count instead (0 when unset).
#
#   $node  - graph node (hash ref). {children}, when present, is a list of
#            [ $childNode, $latency ] pairs.
#   $edges - hash ref recording "parentLine^childLine" keys that were already
#            followed, so every edge is walked at most once.
sub updateDepCounts
{
    my ($node, $edges) = @_;

    #warn "$node->{inst}\n";

    if (my $children = $node->{children})
    {
        foreach my $child (@$children)
        {
            # skip edges that were already traversed
            next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++;
            updateDepCounts($child->[0], $edges);
        }
    }
    # keys() in scalar context yields the number of entries; anything that is
    # not a hash ref is simply forced to a number.
    $node->{deps} = ref $node->{deps} ? keys %{$node->{deps}} : $node->{deps}+0;
}

# Detect register bank conflicts and calculate reuse stats for one instruction.
#
#   $reuseHistory - hash ref carried across calls. Holds one hash per operand
#                   slot (r8, r20, r39) listing the registers currently flagged
#                   for reuse, plus the running counters {total}, {reuse} and
#                   {conflicts}.
#   $reuseFlags   - bit mask for this instruction, tested against
#                   $reuseSlots{$slot} (%reuseSlots is defined elsewhere in
#                   this file).
#   $capData      - hash ref of captured operands; only r8, r20 and r39 are read.
#   $instAddr     - value printed as the hex address in the conflict message.
#   $inst         - instruction text for the message; a false value suppresses it.
#   $nowarn       - true to suppress the conflict message.
#
# Returns the number of registers named in the conflict list (a single
# conflict between two registers therefore returns 2), or 0 when there is none.
sub registerHealth
{
    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;

    my (@banks, @conflicts);

    foreach my $slot (qw(r8 r20 r39))
    {
        my $r = $capData->{$slot} or next;
        next if $r eq 'RZ';

        my $slotHist = $reuseHistory->{$slot} ||= {};

        $reuseHistory->{total}++;

        # if this register is in active reuse then ignore for bank conflict checking.
        if (exists $slotHist->{$r})
        {
            $reuseHistory->{reuse}++;
        }
        else
        {
            # extract number from reg and take the modulo-4 value. This is the bank id.
            my $bank = substr($r,1) & 3;

            # check for conflict
            if ($banks[$bank] && $banks[$bank] ne $r)
            {
                # the first conflict also records the register that already held the bank
                push @conflicts, $banks[$bank] if !@conflicts;
                push @conflicts, $r;

                $reuseHistory->{conflicts}++;
            }
            $banks[$bank] = $r;
        }

        # update the history
        if ($reuseFlags & $reuseSlots{$slot})
            { $slotHist->{$r} = 1; }
        else
            { delete $slotHist->{$r}; }
    }
    if ($inst && @conflicts && !$nowarn)
    {
        # The instruction text is passed as an argument, not interpolated into
        # the format, so a '%' inside it is printed literally.
        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
    }
    return scalar @conflicts;
}

1;

__END__

=head1 NAME

MaxAs::MaxAs - Assembler for NVIDIA Maxwell architecture

=head1 SYNOPSIS

    maxas.pl [opts]

=head1 DESCRIPTION

See the documentation at: https://github.com/NervanaSystems/maxas

=head1 SEE ALSO

See the documentation at: https://github.com/NervanaSystems/maxas


=head1 AUTHOR

Scott Gray, Esgray@nervanasys.com

=head1 COPYRIGHT AND LICENSE

The MIT License (MIT)

Copyright (c) 2014 Scott Gray

Permission is hereby granted, free of charge,
to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +=cut diff --git a/Assembler/MaxAs/blib/lib/MaxAs/MaxAsGrammar.pm b/Assembler/MaxAs/blib/lib/MaxAs/MaxAsGrammar.pm new file mode 100644 index 0000000..fc61543 --- /dev/null +++ b/Assembler/MaxAs/blib/lib/MaxAs/MaxAsGrammar.pm @@ -0,0 +1,1437 @@ +package MaxAs::MaxAsGrammar; + +use strict; +use Carp; +use Exporter; +use Data::Dumper; +our @ISA = qw(Exporter); + +our @EXPORT = qw( + %grammar %flags + parseInstruct genCode genReuseCode + processAsmLine processSassLine processSassCtrlLine + replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters +); + +require 5.10.0; + +# Helper functions for operands +sub getI +{ + my ($orig, $pos, $mask) = @_; + my $val = $orig; + my $neg = $val =~ s|^\-||; + + # parse out our custom index immediates for addresses + if ($val =~ m'^(\d+)[xX]<([^>]+)>') + { + # allow any perl expression and multiply result by leading decimal. + # also allow global scalar varibles in the expression. 
+ my $mul = $1; + my $exp = $2; + # strip leading zeros (don't interpret numbers as octal) + $exp =~ s/(?> $trunc) & 0x7ffff if $trunc; + } + return $val << $pos; +} +sub getR +{ + my ($val, $pos) = @_; + if ($val =~ m'^R(\d+|Z)$' && $1 < 255) + { + $val = $1 eq 'Z' ? 0xff : $1; + } + else + { + die "Bad register name found: $val\n"; + } + return $val << $pos; +} +sub getP +{ + my ($val, $pos) = @_; + if ($val =~ m'^P(\d|T)$' && $1 < 7) + { + $val = $1 eq 'T' ? 7 : $1; + } + else + { + die "Bad predicate name found: $val\n"; + } + return $val << $pos; +} +sub getC { ((hex($_[0]) >> 2) & 0x7fff) << 20 } + +# Map operands into their value and position in the op code. +my %operands = +( + p0 => sub { getP($_[0], 0) }, + p3 => sub { getP($_[0], 3) }, + p12 => sub { getP($_[0], 12) }, + p29 => sub { getP($_[0], 29) }, + p39 => sub { getP($_[0], 39) }, + p45 => sub { getP($_[0], 45) }, + p48 => sub { getP($_[0], 48) }, + p58 => sub { getP($_[0], 58) }, + r0 => sub { getR($_[0], 0) }, + r8 => sub { getR($_[0], 8) }, + r20 => sub { getR($_[0], 20) }, + r28 => sub { getR($_[0], 28) }, + r39s20 => sub { getR($_[0], 39) }, + r39 => sub { getR($_[0], 39) }, + r39a => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to whipe it out, register must be in sequence with r20 + c20 => sub { getC($_[0]) }, + c39 => sub { getC($_[0]) }, + c34 => sub { hex($_[0]) << 34 }, + c36 => sub { hex($_[0]) << 36 }, + f20w32 => sub { getF($_[0], 20, 'f') }, + f20 => sub { getF($_[0], 20, 'f', 12) }, + d20 => sub { getF($_[0], 20, 'd', 44) }, + i8w4 => sub { getI($_[0], 8, 0xf) }, + i20 => sub { getI($_[0], 20, 0x7ffff) }, + i20w6 => sub { getI($_[0], 20, 0x3f) }, + i20w7 => sub { getI($_[0], 20, 0x7f) }, + i20w8 => sub { getI($_[0], 20, 0xff) }, + i20w12 => sub { getI($_[0], 20, 0xfff) }, + i20w24 => sub { getI($_[0], 20, 0xffffff) }, + i20w32 => sub { getI($_[0], 20, 0xffffffff) }, + i31w4 => sub { getI($_[0], 31, 0xf) }, + i34w13 => sub { getI($_[0], 34, 0x1fff) }, + 
i36w20 => sub { getI($_[0], 36, 0xfffff) }, + i39w8 => sub { getI($_[0], 39, 0xff) }, + i28w8 => sub { getI($_[0], 28, 0xff) }, + i28w20 => sub { getI($_[0], 28, 0xfffff) }, + i48w8 => sub { getI($_[0], 48, 0xff) }, + i51w5 => sub { getI($_[0], 51, 0x1f) }, + i53w5 => sub { getI($_[0], 53, 0x1f) }, +); + +# Rules for operands and their closely tied flags +my $hex = qr"0[xX][0-9a-fA-F]+"; +my $iAddr = qr"\d+[xX]<[^>]+>"; +my $immed = qr"$hex|$iAddr|\d+"o; +my $reg = qr"[a-zA-Z_]\w*"; # must start with letter or underscore\ +my $p = qr"P[0-6T]"; +my $noPred = qr"(?)"; +my $pred = qr"\@(?\!)?P(?[0-6]) "; +my $p0 = qr"(?$p)"o; +my $p3 = qr"(?$p)"o; +my $p12 = qr"(?\!)?(?$p)"o; +my $p29 = qr"(?\!)?(?$p)"o; +my $p39 = qr"(?\!)?(?$p)"o; +my $p45 = qr"(?$p)"o; +my $p48 = qr"(?$p)"o; +my $p58 = qr"(?$p)"o; +my $r0 = qr"(?$reg)"; +my $r0cc = qr"(?$reg)(?\.CC)?"; +my $r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r28 = qr"(?$reg)"; +my $r39s20 = qr"(?\-)?(?\|)?(?(?$reg))\|?(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39 = qr"(?\-)?(?$reg)(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39a = qr"(?(?$reg))(?\.reuse)?"; +my $c20 = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20x = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20s39 = qr"(?\-)?c\[(?$hex)\]\s*\[(?$hex)\]"o; +my $f20w32 = qr"(?(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))"; +my $f20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $d20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $i8w4 = qr"(?$immed)"o; +my $i20 = qr"(?(?\-)?$immed)(?\.NEG)?"o; +my $i20w6 = qr"(?$immed)"o; +my $i20w7 = qr"(?$immed)"o; +my $i20w8 = qr"(?$immed)"o; +my $i20w12 = qr"(?$immed)"o; +my $i20w24 = qr"(?\-?$immed)"o; +my $i20w32 = qr"(?\-?$immed)"o; +my $i39w8 = qr"(?\-?$immed)"o; +my $i28w8 = 
qr"(?$immed)"o; +my $i28w20 = qr"(?\-?$immed)"o; +my $i31w4 = qr"(?$immed)"o; +my $i34w13 = qr"(?$immed)"o; +my $i36w20 = qr"(?$immed)"o; +my $i48w8 = qr"(?$immed)"o; +my $i51w5 = qr"(?$immed)"o; +my $i53w5 = qr"(?$immed)"o; +my $ir20 = qr"$i20|$r20"o; +my $cr20 = qr"$c20|$r20"o; +my $icr20 = qr"$i20|$c20|$r20"o; +my $fcr20 = qr"$f20|$c20|$r20"o; +my $cr39 = qr"$c20s39|$r39"o; +my $dr20 = qr"$d20|$r20"o; + +# Instruction specific rules for capturing various flags +my $u32 = qr"(?\.U32)?"; +my $ftz = qr"(?\.FTZ)?"; +my $sat = qr"(?\.SAT)?"; +my $rnd = qr"(?:\.(?RN|RM|RP|RZ))?"; +my $round = qr"(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?"; +my $fcmp = qr"(?\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; +my $icmp = qr"\.(?LT|EQ|LE|GT|NE|GE)"; +my $bool = qr"\.(?AND|OR|XOR|PASS_B)"; +my $bool2 = qr"\.(?AND|OR|XOR)"; +my $func = qr"\.(?COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)"; +my $rro = qr"\.(?SINCOS|EX2)"; +my $add3 = qr"(?:\.(?X|RS|LS))?"; +my $lopz = qr"(?:\.(?NZ|Z) $p48,|(?))"o; +my $X = qr"(?\.X)?"; +my $tld = qr"(?NODEP\.)?(?:(?T)|(?P))"; +my $chnls = qr"(?R|RGBA)"; +my $sr = qr"SR_(?\S+)"; +my $shf = qr"(?\.W)?(?:\.(?U64|S64))?(?\.HI)?"; +my $xmad = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $xmadc = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $vmad8 = qr"\.(?[SU])(?8|16)\.(?[SU])(?8|16)(?\.PO)?(?\.SHR_7)?(?\.SHR_15)?(?\.SAT)?"; +my $vmad16= qr"\.(?[SU])(?16)\.(?[SU])(?16)"; +my $hilo = qr"(?:\.(?XHI|XLO))?"; +my $vaddType = qr"(?:\.(?UD))?(?:\.(?SD))?(?:\.(?[SU])(?8|16|32))?(?:\.(?[SU])(?8|16|32))?"; +my $vaddMode = qr"(?:\.(?MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?"; +my $vmnmx = qr"(?:\.(?MX))?"; +my $x2x = qr"\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)"; +my $prmt = qr"(?:\.(?F4E|B4E|RC8|ECL|ECR|RC16))?"; +my $shfl = qr"\.(?IDX|UP|DOWN|BFLY)"; +my $bar = qr"\.(?SYNC|ARV|RED)(?:\.(?POPC|AND|OR))? 
(?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?()|(?))(?(), $p39|(?))"o; +my $b2r = qr"\.RESULT $r0(?:, $p45|(?))"o; +my $dbar = qr"(?SB0|SB1|SB2|SB3|SB4|SB5)"; +my $dbar2 = qr" {(?5)?,?(?4)?,?(?3)?,?(?2)?,?(?1)?,?(?0)?}"; +my $mbar = qr"\.(?CTA|GL|SYS)"; +my $addr = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i20w24)?\]"o; +my $addr2 = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i28w20)?\]"o; +my $ldc = qr"c\[(?$hex)\]\s*$addr"o; +my $atom = qr"(?\.E)?(?:\.(?ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; +my $vote = qr"\.(?ALL|ANY|EQ)"o; +my $memType = qr"(?\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; +my $memCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CI|CS|CV|IL|WT))?"; + + + +# class: hardware resource that shares characteristics with types +# lat : pipeline depth where relevent, placeholder for memory ops +# blat : barrier latency, typical fetch time for memory operations. Highly variable. +# rlat : operand read latency for memory ops +# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op. +# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession. +# dual : whether this instruction type can be dual issued +# reuse: whether this instruction type accepts register reuse flags. + +# Some of these values are guesses and need to be updated from micro benchmarks. +# We may need to split these classes up further. 
+my $s2rT = {class => 's2r', lat => 2, blat => 25, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $smemT = {class => 'mem', lat => 2, blat => 30, rlat => 2, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $gmemT = {class => 'mem', lat => 2, blat => 200, rlat => 4, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $x32T = {class => 'x32', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 1}; +my $x64T = {class => 'x64', lat => 2, blat => 128, rlat => 0, rhold => 0, tput => 128, dual => 0, reuse => 1}; +my $shftT = {class => 'shift', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $cmpT = {class => 'cmp', lat => 13, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $qtrT = {class => 'qtr', lat => 8, blat => 0, rlat => 4, rhold => 0, tput => 1, dual => 1, reuse => 0}; +my $rroT = {class => 'rro', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $voteT = {class => 'vote', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; + + +# Create map of op names to rules +our %grammar = +( + #Floating Point Instructions + FADD => [ { type => $x32T, code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FADD32I => [ { type => $x32T, code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o, } ], + FCHK => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o, } ], #Partial? 
+ FCMP => [ { type => $cmpT, code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o, } ], + FFMA => [ + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o, }, + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o, }, + ], + FMNMX => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o, } ], + FMUL => [ { type => $x32T, code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FMUL32I => [ { type => $x32T, code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o, } ], + FSET => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o, } ], + FSETP => [ { type => $cmpT, code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], + MUFU => [ { type => $qtrT, code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o, } ], + RRO => [ { type => $rroT, code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o, } ], + DADD => [ { type => $x64T, code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o, } ], + DFMA => [ { type => $x64T, code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o, } ], + DMNMX => [ { type => $cmpT, code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o, } ], + DMUL => [ { type => $x64T, code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o, } ], + DSET => [ { type => $cmpT, code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o, } ], + DSETP => [ { type => $cmpT, code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o, } ], + FSWZADD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o, } ], #TODO + + HADD2 => [ 
{ type => $x32T, code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o, } ], + HMUL2 => [ { type => $x32T, code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o, } ], + HFMA2 => [ { type => $x32T, code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o, } ], + HSETP2 => [ { type => $cmpT, code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial + + #Integer Instructions + BFE => [ { type => $shftT, code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o, } ], + BFI => [ { type => $shftT, code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o, } ], + FLO => [ { type => $s2rT, code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o, } ], + IADD => [ { type => $x32T, code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o, } ], + IADD32I => [ { type => $x32T, code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o, } ], + IADD3 => [ { type => $x32T, code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o, } ], + ICMP => [ { type => $cmpT, code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o, } ], + IMNMX => [ { type => $shftT, code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o, } ], + ISET => [ { type => $shftT, code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o, } ], + ISETP => [ { type => $cmpT, code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ], + ISCADD => [ { type => $shftT, code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o, } ], + ISCADD32I => [ { type => $shftT, code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o, } ], + LEA => [ + { type => $cmpT, code => 0x5bd0000000000000, rule => 
qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o, }, + { type => $shftT, code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o, }, + { type => $shftT, code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o, }, + { type => $shftT, code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o, }, + ], + LOP => [ { type => $x32T, code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?~)?$icr20(?\.INV)?;"o, } ], + LOP32I => [ { type => $x32T, code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o, } ], + LOP3 => [ + { type => $x32T, code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o, }, + { type => $x32T, code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o, }, + ], + POPC => [ { type => $s2rT, code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o, } ], + SHF => [ + { type => $shftT, code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o, }, + { type => $shftT, code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o, }, + ], + SHL => [ { type => $shftT, code => 0x5c48000000000000, rule => qr"^$pred?SHL(?\.W)? 
$r0, $r8, $icr20;"o, } ], + SHR => [ { type => $shftT, code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o, } ], + XMAD => [ + { type => $x32T, code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o, }, + { type => $x32T, code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o, }, + ], + # XMAD replaces these + IMAD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o, } ], #TODO + IMADSP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO + IMUL => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o, } ], #TODO + + #Conversion Instructions + F2F => [ { type => $qtrT, code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ], + F2I => [ { type => $qtrT, code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o, } ], + I2F => [ { type => $qtrT, code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o, } ], + I2I => [ { type => $qtrT, code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o, } ], + + #Movement Instructions + MOV => [ { type => $x32T, code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o, } ], + MOV32I => [ { type => $x32T, code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o, } ], + PRMT => [ { type => $x32T, code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ], + SEL => [ { type => $x32T, code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o, } ], + SHFL => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ], + + #Predicate/CC Instructions + PSET => [ { type => $cmpT, code => 0x5088000000000000, rule => 
qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o, } ], + PSETP => [ { type => $cmpT, code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ], + CSET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o, } ], #TODO + CSETP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO + P2R => [ { type => $x32T, code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o, } ], + R2P => [ { type => $cmpT, code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o, } ], + + #Texture Instructions + # Handle the commonly used 1D texture functions.. but save the others for later + TLD => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial + TLDS => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial + TEX => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o, } ], #TODO + TLD4 => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o, } ], #TODO + TXQ => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o, } ], #TODO + TEXS => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o, } ], #TODO + TLD4S => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO + + #Compute Load/Store Instructions + LD => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o, } ], + ST => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o, } ], + LDG => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o, } ], + STG => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, 
$r0;"o, } ], + LDS => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o, } ], + STS => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o, } ], + LDL => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o, } ], + STL => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o, } ], + LDC => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o, } ], + # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded). + ATOM => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + ATOMS => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + RED => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o, } ], + CCTL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o, } ], #TODO + CCTLL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO + CCTLT => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO + + #Surface Memory Instructions (haven't gotten to these yet..) + SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO + SULD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o, } ], #TODO + SURED => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o, } ], #TODO + SUST => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o, } ], #TODO + + #Control Instructions + BRA => [ + { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?\.U)? 
$i20w24;"o, }, + { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?\.U)? CC\.EQ, $i20w24;"o, }, + ], + BRX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o, } ], #TODO + JMP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o, } ], #TODO + JMX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o, } ], #TODO + SSY => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o, } ], + SYNC => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o, } ], + CAL => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o, } ], + JCAL => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o, } ], + PRET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o, } ], #TODO + RET => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o, } ], + BRK => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o, } ], + PBK => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o, } ], + CONT => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o, } ], + PCNT => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o, } ], + EXIT => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o, } ], + PEXIT => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o, } ], #TODO + BPT => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o, } ], + + #Miscellaneous Instructions + NOP => [ { type => $x32T, code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o, } ], + CS2R => [ { type => $x32T, code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o, } ], + S2R => [ { type => $s2rT, code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o, } ], + B2R => [ { type => $x32T, code => 
0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o, } ], + BAR => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o, } ], + DEPBAR => [ + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, }, + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o, }, + ], + MEMBAR => [ { type => $x32T, code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o, } ], + VOTE => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?))$p45, $p39;"o, } ], + R2B => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o, } ], #TODO + + #Video Instructions... Need to finish + VADD => [ { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMAD => [ + { type => $x32T, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, }, + ], + VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMNMX => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + + VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 +); + +# Create map of capture groups to op code flags that need to be added (or removed) +my @flags = grep /\S/, split "\n", q{; + +BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD +0x0100000000000000 neg + +FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP +0x0100000000000000 neg + +PSET, PSETP 
+0x0000000000008000 p12not +0x0000000100000000 p29not + +FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE +0x0000040000000000 p39not + +IADD, IADD3, XMAD, LEA, IMNMX +0x0000800000000000 CC + +IADD32I +0x0010000000000000 CC + +LEA +0x0000000000000000 X + +SHF +0x0004000000000000 W +0x0001000000000000 HI + +SHF: type +0x0000004000000000 U64 +0x0000006000000000 S64 + +SHR, IMNMX, ISETP, ISET, ICMP, BFE +0x0001000000000000 U32 + +SHL +0x0000008000000000 W + +SHFL +0x0000000010000000 i20w8 +0x0000000020000000 i34w13 + +SHFL: mode +0x0000000000000000 IDX +0x0000000040000000 UP +0x0000000080000000 DOWN +0x00000000c0000000 BFLY + +IMNMX: mode +0x0000080000000000 XLO +0x0000180000000000 XHI + +ISETP, ISET, ICMP: cmp +0x0002000000000000 LT +0x0004000000000000 EQ +0x0006000000000000 LE +0x0008000000000000 GT +0x000a000000000000 NE +0x000c000000000000 GE + +ISETP, ISET, PSETP, PSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +PSETP, PSET: bool2 +0x0000000000000000 AND +0x0000000001000000 OR +0x0000000002000000 XOR + +ISETP, ISET +0x0000080000000000 X + +LOP: bool +0x0000000000000000 AND +0x0000020000000000 OR +0x0000040000000000 XOR +0x0000060000000000 PASS_B + +LOP: +0x0000010000000000 INV + +LOP: z +0x0000200000000000 Z +0x0000300000000000 NZ + +LOP +0x0007000000000000 noz + +LOP32I: bool +0x0000000000000000 AND +0x0020000000000000 OR +0x0040000000000000 XOR + +PRMT: mode +0x0001000000000000 F4E +0x0002000000000000 B4E +0x0003000000000000 RC8 +0x0004000000000000 ECL +0x0005000000000000 ECR +0x0006000000000000 RC16 + +XMAD: type1 +0x0000000000000000 U16 +0x0001000000000000 S16 + +XMAD: type2 +0x0000000000000000 U16 +0x0002000000000000 S16 + +XMAD: mode +0x0000002000000000 MRG +0x0000001000000000 PSL +0x0008000000000000 CHI +0x0004000000000000 CLO +0x000c000000000000 CSFU + +XMAD: modec +0x0004000000000000 CLO +0x0008000000000000 CHI +0x000c000000000000 CSFU +0x0040000000000000 X +0x0080000000000000 PSL 
+0x0100000000000000 MRG + +XMAD +0x0010000000000000 CBCC + +XMAD: r8part +0x0000000000000000 H0 +0x0020000000000000 H1 + +XMAD: r20part +0x0000000000000000 H0 +0x0000000800000000 H1 + +XMAD: r20partx +0x0000000000000000 H0 +0x0010000000000000 H1 + +XMAD: r39part +0x0000000000000000 H0 +0x0010000000000000 H1 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part +0x0000000000000000 B0 +0x0000001000000000 B1 +0x0000002000000000 B2 +0x0000003000000000 B3 +0x0000001000000000 H1 +0x0000000000000000 H0 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part +0x0000000000000000 B0 +0x0000000010000000 B1 +0x0000000020000000 B2 +0x0000000030000000 B3 +0x0000000010000000 H1 +0x0000000000000000 H0 + +VMAD +0x0040000000000000 r8neg +0x0020000000000000 r39neg +0x0008000000000000 SHR_7 +0x0010000000000000 SHR_15 +0x0060000000000000 PO +0x0080000000000000 SAT + +VMNMX +0x0100000000000000 MX + +VADD, VABSDIFF, VMNMX +0x0080000000000000 SAT +0x0040000000000000 UD +0x0040000000000000 SD + +VSET: cmp +0x0040000000000000 LT +0x0080000000000000 EQ +0x00c0000000000000 LE +0x0100000000000000 GT +0x0140000000000000 NE +0x0180000000000000 GE + +VADD, VSET: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VABSDIFF: mode +0x0003000000000000 ACC +0x000b000000000000 MIN +0x0013000000000000 MAX +0x0023000000000000 MRG_16H +0x002b000000000000 MRG_16L +0x0033000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x003b000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMNMX: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1 +0x0000000000000000 U +0x0001000000000000 S + +VMAD, VADD, 
VABSDIFF, VMNMX, VSET: sign2 +0x0000000000000000 U +0x0002000000000000 S + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size1 +0x0000000000000000 8 +0x0000004000000000 16 +0x0000006000000000 32 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size2 +0x0000000000000000 8 +0x0000000040000000 16 +0x0000000060000000 32 + +IADD3: type +0x0001000000000000 X +0x0000002000000000 RS +0x0000004000000000 LS + +IADD3: r8part +0x0000000000000000 H0 +0x0000001000000000 H1 + +IADD3: r20part +0x0000000080000000 H0 + +IADD3: r39part +0x0000000200000000 H0 + +IADD3 +0x0008000000000000 r8neg +0x0004000000000000 r20neg +0x0002000000000000 r39neg + +IADD +0x0000080000000000 X +0x0004000000000000 SAT + +IADD, ISCADD +0x0002000000000000 r8neg +0x0001000000000000 r20neg + +IADD32I +0x0100000000000000 r8neg +0x0020000000000000 X + +DEPBAR: SB +0x0000000000000000 SB0 +0x0000000004000000 SB1 +0x0000000008000000 SB2 +0x000000000c000000 SB3 +0x0000000010000000 SB4 +0x0000000014000000 SB5 + +DEPBAR: cmp +0x0000000020000000 LE + +DEPBAR +0x0000000000000001 db0 +0x0000000000000002 db1 +0x0000000000000004 db2 +0x0000000000000008 db3 +0x0000000000000010 db4 +0x0000000000000020 db5 + +F2F, F2I, I2F, I2I: destWidth +0x0000000000000000 8 +0x0000000000000100 16 +0x0000000000000200 32 +0x0000000000000300 64 + +F2F, F2I, I2F, I2I: srcWidth +0x0000000000000000 8 +0x0000000000000400 16 +0x0000000000000800 32 +0x0000000000000c00 64 + +F2F, F2I, I2F, I2I: destSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000001000 S + +F2F, F2I, I2F, I2I: srcSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000002000 S + +F2I, I2F, I2I: r20part +0x0000000000000000 H0 +0x0000040000000000 H1 +0x0000000000000000 B0 +0x0000020000000000 B1 +0x0000040000000000 B2 +0x0000060000000000 B3 + +F2F: r20part +0x0000000000000000 H0 +0x0000020000000000 H1 + +F2F: round +0x0000040000000000 ROUND +0x0000048000000000 FLOOR +0x0000050000000000 CEIL +0x0000058000000000 TRUNC + +F2I: round +0x0000000000000000 ROUND +0x0000008000000000 FLOOR 
+0x0000010000000000 CEIL +0x0000018000000000 TRUNC + +HADD2, HMUL2: r8part +0x0001000000000000 H0_H0 +0x0000000000000000 H1_H1 + +HFMA2: r20part +0x0000000020000000 H0_H0 +0x0000000030000000 H1_H1 + +FADD, DADD, FMUL, DMUL, F2F, I2F: rnd +0x0000000000000000 RN +0x0000008000000000 RM +0x0000010000000000 RP +0x0000018000000000 RZ + +DFMA: rnd +0x0000000000000000 RN +0x0004000000000000 RM +0x0008000000000000 RP +0x000c000000000000 RZ + +FFMA: rnd +0x0000000000000000 RN +0x0008000000000000 RM +0x0010000000000000 RP +0x0018000000000000 RZ + +FFMA +0x0020000000000000 FTZ + +F2F, F2I, FADD, FMUL, FMNMX +0x0000100000000000 FTZ + +FADD32I +0x0080000000000000 FTZ + +FMUL32I +0x0020000000000000 FTZ + +FSET +0x0080000000000000 FTZ + +FSETP, FCMP +0x0000800000000000 FTZ + +HADD2, HMUL2 +0x0000008000000000 FTZ + +HFMA2 +0x0000002000000000 FTZ + +FADD, FFMA, FMUL, F2F, I2I +0x0004000000000000 SAT + +FADD, DADD, FMNMX, DMNMX, MUFU +0x0001000000000000 r8neg + +FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I +0x0000200000000000 r20neg + +FMUL, DMUL, FFMA, DFMA +0x0001000000000000 r20neg + +FFMA, DFMA +0x0002000000000000 r39neg + +FADD, DADD, FMNMX, DMNMX +0x0000400000000000 r8abs + +FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I +0x0002000000000000 r20abs + +FSETP, DSETP, FSET, DSET +0x0000080000000000 r8neg +0x0000000000000040 r20neg +0x0000000000000080 r8abs +0x0000100000000000 r20abs + +RRO: func +0x0000000000000000 SINCOS +0x0000008000000000 EX2 + +MUFU: func +0x0000000000000000 COS +0x0000000000100000 SIN +0x0000000000200000 EX2 +0x0000000000300000 LG2 +0x0000000000400000 RCP +0x0000000000500000 RSQ +0x0000000000600000 RCP64H +0x0000000000700000 RSQ64H + +FSETP, DSETP, FSET, DSET, FCMP: cmp +0x0001000000000000 .LT +0x0002000000000000 .EQ +0x0003000000000000 .LE +0x0004000000000000 .GT +0x0004000000000000 +0x0005000000000000 .NE +0x0006000000000000 .GE +0x0007000000000000 .NUM +0x0008000000000000 .NAN +0x0009000000000000 .LTU +0x000a000000000000 .EQU +0x000b000000000000 .LEU 
+0x000c000000000000 .GTU +0x000d000000000000 .NEU +0x000e000000000000 .GEU + +FSETP, DSETP, FSET, DSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +HSETP2: cmp +0x0000002800000000 .NE + +HSETP2: bool +0x0000000000000000 AND + +S2R: sr +0x0000000000000000 LANEID +0x0000000000200000 VIRTCFG +0x0000000000300000 VIRTID +0x0000000002100000 TID.X +0x0000000002200000 TID.Y +0x0000000002300000 TID.Z +0x0000000002500000 CTAID.X +0x0000000002600000 CTAID.Y +0x0000000002700000 CTAID.Z +0x0000000003800000 EQMASK +0x0000000003900000 LTMASK +0x0000000003a00000 LEMASK +0x0000000003b00000 GTMASK +0x0000000003c00000 GEMASK + +CS2R: sr +0x0000000005000000 CLOCKLO +0x0000000005100000 CLOCKHI +0x0000000005200000 GLOBALTIMERLO +0x0000000005300000 GLOBALTIMERHI + +B2R +0x0000e00000000000 nop45 + +BAR +0x0000100000000000 i8w4 +0x0000080000000000 nor20 +0x0000038000000000 nop39 + +BAR: mode +0x0000000000000000 SYNC +0x0000000100000000 ARV +0x0000000200000000 RED + +BAR: red +0x0000000000000000 POPC +0x0000000800000000 AND +0x0000001000000000 OR + +MEMBAR: mode +0x0000000000000000 CTA +0x0000000000000100 GL +0x0000000000000200 SYS + +VOTE: mode +0x0000000000000000 ALL +0x0001000000000000 ANY +0x0002000000000000 EQ + +VOTE +0x00000000000000ff nor0 + +BRA +0x0000000000000080 U + +TLDS: chnls +0x0010000000000000 RGBA + +TLDS +0x0002000000000000 NODEP + +LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS +0x000000000000ff00 nor8 + +LD, ST: type +0x0000000000000000 .U8 +0x0020000000000000 .S8 +0x0040000000000000 .U16 +0x0060000000000000 .S16 +0x0080000000000000 +0x0080000000000000 .32 +0x00a0000000000000 .64 +0x00c0000000000000 .128 + +LD, ST: cache +0x0100000000000000 CG +0x0200000000000000 CS +0x0300000000000000 CV +0x0300000000000000 WT + +LDG, STG, LDS, STS, LDL, STL, LDC: type +0x0000000000000000 .U8 +0x0001000000000000 .S8 +0x0002000000000000 .U16 +0x0003000000000000 .S16 +0x0004000000000000 +0x0004000000000000 .32 +0x0005000000000000 .64 
+0x0006000000000000 .128 + +LDG, STG: cache +0x0000400000000000 CG +0x0000800000000000 CI +0x0000800000000000 CS +0x0000c00000000000 CV +0x0000c00000000000 WT + +LDL: cache +0x0000200000000000 CI + +LDC: cache +0x0000100000000000 IL + +LDG, STG, LDS, STS, LDL, STL, LDC +0x0000200000000000 E + +LDS +0x0000100000000000 U + +RED: type +0x0000000000000000 +0x0000000000100000 .S32 +0x0000000000200000 .U64 +0x0000000000300000 .F32.FTZ.RN +0x0000000000400000 .F16x2.FTZ.RN +0x0000000000500000 .S64 + +RED: mode +0x0000000000000000 ADD +0x0000000000800000 MIN +0x0000000001000000 MAX +0x0000000001800000 INC +0x0000000002000000 DEC +0x0000000002800000 AND +0x0000000003000000 OR +0x0000000003800000 XOR + +ATOM: type +0x0000000000000000 +0x0002000000000000 .S32 +0x0004000000000000 .U64 +0x0006000000000000 .F32.FTZ.RN +0x0008000000000000 .F16x2.FTZ.RN +0x000a000000000000 .S64 +0x0002000000000000 .64 + +ATOM, RED +0x0001000000000000 E + +ATOM: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x03f0000000000000 CAS + +ATOMS: type +0x0000000000000000 +0x0000000010000000 .S32 +0x0000000020000000 .U64 +0x0000000030000000 .S64 +0x0010000000000000 .64 + +ATOMS: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x0240000000000000 CAS +}; + +# The existence of a capture group can map directly to an op code adjustment, or... 
+# The named capture group value can map the op code adjustmemt from among several options +our %flags; +my (@ops, $flag); +foreach my $line (@flags) +{ + if ($line =~ m'^(0x[0-9a-z]+)\s*(.*)') + { + my $val = hex($1); + # named rules (op: name) + if ($flag) + { $flags{$_}{$flag}{$2} = $val foreach @ops; } + # simple existence check rules + else + { $flags{$_}{$2} = $val foreach @ops; } + } + else + { + my ($ops, $name) = split ':\s*', $line; + @ops = split ',\s*', $ops; + $flag = $name; + } +} + +sub parseInstruct +{ + my ($inst, $grammar) = @_; + return unless $inst =~ $grammar->{rule}; + my %capData = %+; + return \%capData; +} + +# for immediate or constant operands and a given opcode, bits 56-63 get transformed +my %immedOps = map { $_ => 1 } qw(i20 f20 d20); +my %immedCodes = +( + 0x5c => 0x64, + 0x5b => 0x6d, + 0x59 => 0x6b, + 0x58 => 0x68, +); +my %constCodes = +( + c20 => 0x10, + c39 => 0x08, +); +my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4); + +# just pick out the reuse code and nothing else +sub genReuseCode +{ + my $capData = shift; + my $reuse = 0; + $reuse |= $reuseCodes{$_} foreach grep $capData->{$_}, keys %reuseCodes; + return $reuse; +} + +# Generate an op code from regex capture data +# if you pass in a test array ref it will populate it with the matching capture groups +sub genCode +{ + my ($op, $grammar, $capData, $test) = @_; + + my $flags = $flags{$op}; + my $code = $grammar->{code}; + my $reuse = 0; + my $immedCode = $immedCodes{$code >> 56}; + + #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I'; + + # process the instruction predicate (if valid for this instuction) + if (exists $capData->{noPred}) + { + delete $capData->{noPred}; + push @$test, 'noPred' if $test; + } + else + { + my $p = defined($capData->{predNum}) ? 
$capData->{predNum} : 7; + push @$test, 'predNum' if $test; + if (exists $capData->{predNot}) + { + $p |= 8; + push @$test, 'predNot' if $test; + } + $code ^= $p << 16; + delete @{$capData}{qw(predNum predNot)}; + + } + # process the register reuse flags + foreach my $rcode (qw(reuse1 reuse2 reuse3)) + { + if (delete $capData->{$rcode}) + { + $reuse |= $reuseCodes{$rcode}; + push @$test, $rcode if $test; + } + } + + foreach my $capture (keys %$capData) + { + # change the base code for immediate versions of the op + if (exists $immedOps{$capture}) + { $code ^= $immedCode << 56; } + # change the base code for constant versions of the op + elsif (exists $constCodes{$capture}) + { $code ^= $constCodes{$capture} << 56; } + + # if capture group is an operand then process and add that data to code + if (exists $operands{$capture}) + { + # don't process the r20 that comes with the r39s20 capture + unless ($capture eq 'r20' && exists $capData->{r39s20}) + { + $code ^= $operands{$capture}->($capData->{$capture}); + push @$test, $capture if $test; + } + } + + # Add matching flags (an operand might also add/remove a flag) + if (exists $flags->{$capture}) + { + # a named multivalue flag + if (ref $flags->{$capture}) + { + $code ^= $flags->{$capture}{$capData->{$capture}}; + push @$test, "$capture:$capData->{$capture}" if $test; + } + # a simple exists flag + else + { + $code ^= $flags->{$capture}; + push @$test, $capture if $test; + } + } + elsif (!exists $operands{$capture} && !$test) + { + # Every capture group should be acted upon. Missing one is a bug. 
+ warn "UNUSED: $op: $capture: $capData->{$capture}\n"; + warn Dumper($flags); + } + } + + return $code, $reuse; +} + + +my $CtrlRe = qr'(?[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])'; +my $PredRe = qr'(?@!?(?P\d)\s+)'; +my $InstRe = qr"$PredRe?(?\w+)(?[^;]*;)"o; +my $CommRe = qr'(?.*)'; + +sub processAsmLine +{ + my ($line, $lineNum) = @_; + + if ($line =~ m"^$CtrlRe(?\s+)$InstRe$CommRe"o) + { + return { + lineNum => $lineNum, + pred => $+{pred}, + predReg => $+{predReg}, + space => $+{space}, + op => $+{op}, + comment => $+{comment}, + inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}), + ctrl => readCtrl($+{ctrl}, $line), + }; + } + return undef; +} + +sub processSassLine +{ + my $line = shift; + + if ($line =~ m"^\s+/\*(?[0-9a-f]+)\*/\s+$InstRe\s+/\* (?0x[0-9a-f]+)"o) + { + return { + num => hex($+{num}), + pred => $+{pred}, + op => $+{op}, + ins => normalizeSpacing($+{op} . $+{rest}), + inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}), + code => hex($+{code}), + }; + } + return undef; +} + +sub processSassCtrlLine +{ + my ($line, $ctrl, $ruse) = @_; + + return 0 unless $line =~ m'^\s+\/\* (0x[0-9a-f]+)'; + + my $code = hex($1); + if (ref $ctrl) + { + push @$ctrl, ($code & 0x000000000001ffff) >> 0; + push @$ctrl, ($code & 0x0000003fffe00000) >> 21; + push @$ctrl, ($code & 0x07fffc0000000000) >> 42; + } + if (ref $ruse) + { + push @$ruse, ($code & 0x00000000001e0000) >> 17; + push @$ruse, ($code & 0x000003c000000000) >> 38; + push @$ruse, ($code & 0x7800000000000000) >> 59; + } + return 1; +} + +sub replaceXMADs +{ + my $file = shift; + +# XMAD.LO d, a, b, c, x; +# ---------------------- +# XMAD.MRG x, a, b.H1, RZ; +# XMAD d, a, b, c; +# XMAD.PSL.CBCC d, a.H1, x.H1, d; +# ---------------------- +# XMAD d, a, 0xffff, c; +# XMAD.PSL d, a.H1, 0xffff, d; + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD\.LO\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*,\s*(?\w+)\s*;$CommRe/ + + die "XMAD.LO: Destination and first operand cannot 
be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s +%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s; +%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;', + @+{qw(ctrl space pred d a b c x comment)} + /egmos; + + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD(?(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?-?$immed|\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/ + + die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s +%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;', + @+{qw(ctrl space pred d a b c comment mod)} + /egmos; + + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD(?(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*,\s*(?\w+)\s*;$CommRe/ + + die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s +%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;', + @+{qw(ctrl space pred d a b c comment mod)} + /egmos; + + #TODO: add more XMAD macros + return $file; +} +# convert extra spaces to single spacing to make our re's simplier +sub normalizeSpacing +{ + my $inst = shift; + $inst =~ s/\t/ /g; + $inst =~ s/\s{2,}/ /g; + return $inst; +} + + +# map binary control notation on to easier to work with format. +sub printCtrl +{ + my $code = shift; + + my $stall = ($code & 0x0000f) >> 0; + my $yield = ($code & 0x00010) >> 4; + my $wrtdb = ($code & 0x000e0) >> 5; # write dependency barier + my $readb = ($code & 0x00700) >> 8; # read dependency barier + my $watdb = ($code & 0x1f800) >> 11; # wait on dependency barier + + $yield = $yield ? '-' : 'Y'; + $wrtdb = $wrtdb == 7 ? '-' : $wrtdb + 1; + $readb = $readb == 7 ? '-' : $readb + 1; + $watdb = $watdb ? 
sprintf('%02x', $watdb) : '--'; + + return sprintf '%s:%s:%s:%s:%x', $watdb, $readb, $wrtdb, $yield, $stall; +} +sub readCtrl +{ + my ($ctrl, $context) = @_; + my ($watdb, $readb, $wrtdb, $yield, $stall) = split ':', $ctrl; + + $watdb = $watdb eq '--' ? 0 : hex $watdb; + $readb = $readb eq '-' ? 7 : $readb - 1; + $wrtdb = $wrtdb eq '-' ? 7 : $wrtdb - 1; + $yield = $yield eq 'y' || $yield eq 'Y' ? 0 : 1; + $stall = hex $stall; + + die sprintf('wait dep out of range(0x00-0x3f): %x at %s', $watdb, $context) if $watdb != ($watdb & 0x3f); + + return + $watdb << 11 | + $readb << 8 | + $wrtdb << 5 | + $yield << 4 | + $stall << 0; +} + +sub getRegNum +{ + my ($regMap, $regName) = @_; + + return !exists($regMap->{$regName}) || ref($regMap->{$regName}) ? $regName : $regMap->{$regName}; +} + +sub getVecRegisters +{ + my ($vectors, $capData) = @_; + my $regName = $capData->{r0} or return; + + return if $regName eq 'RZ'; + + if ($capData->{type} eq '.64' || $capData->{i31w4} eq '0x3') + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. $1+1); + } + confess "$regName not a 64bit vector register" unless exists $vectors->{$regName}; + return @{$vectors->{$regName}}[0,1]; + } + if ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. $1+3); + } + confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4; + return @{$vectors->{$regName}}; + } + return $regName; +} + +sub getAddrVecRegisters +{ + my ($vectors, $capData) = @_; + my $regName = $capData->{r8} or return; + + return if $regName eq 'RZ'; + + if (exists $capData->{E}) + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. 
$1+1); + } + print Dumper($vectors) unless exists $vectors->{$regName}; + confess "$regName not a 64bit vector register" unless exists $vectors->{$regName}; + return @{$vectors->{$regName}}[0,1]; + } + return $regName; +} + +__END__ + + + diff --git a/Assembler/MaxAs/blib/lib/auto/MaxAs/MaxAs/.exists b/Assembler/MaxAs/blib/lib/auto/MaxAs/MaxAs/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/blib/man1/.exists b/Assembler/MaxAs/blib/man1/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/blib/man3/.exists b/Assembler/MaxAs/blib/man3/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/blib/man3/MaxAs::MaxAs.3pm b/Assembler/MaxAs/blib/man3/MaxAs::MaxAs.3pm new file mode 100644 index 0000000..9f95fff --- /dev/null +++ b/Assembler/MaxAs/blib/man3/MaxAs::MaxAs.3pm @@ -0,0 +1,170 @@ +.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.13) +.\" +.\" Standard preamble: +.\" ======================================================================== +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" Set up some character translations and predefined strings. \*(-- will +.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left +.\" double quote, and \*(R" will give a right double quote. \*(C+ will +.\" give a nicer C++. Capital omega is used to do unbreakable dashes and +.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, +.\" nothing in troff, for use with C<>. +.tr \(*W- +.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' +.ie n \{\ +. ds -- \(*W- +. ds PI pi +. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch +. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch +. ds L" "" +. ds R" "" +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds -- \|\(em\| +. ds PI \(*p +. 
ds L" `` +. ds R" '' +'br\} +.\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" +.\" If the F register is turned on, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.ie \nF \{\ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. nr % 0 +. rr F +.\} +.el \{\ +. de IX +.. +.\} +.\" +.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). +.\" Fear. Run. Save yourself. No user-serviceable parts. +. \" fudge factors for nroff and troff +.if n \{\ +. ds #H 0 +. ds #V .8m +. ds #F .3m +. ds #[ \f1 +. ds #] \fP +.\} +.if t \{\ +. ds #H ((1u-(\\\\n(.fu%2u))*.13m) +. ds #V .6m +. ds #F 0 +. ds #[ \& +. ds #] \& +.\} +. \" simple accents for nroff and troff +.if n \{\ +. ds ' \& +. ds ` \& +. ds ^ \& +. ds , \& +. ds ~ ~ +. ds / +.\} +.if t \{\ +. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" +. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' +. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' +. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' +. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' +. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' +.\} +. \" troff and (daisy-wheel) nroff accents +.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' +.ds 8 \h'\*(#H'\(*b\h'-\*(#H' +.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] +.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' +.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' +.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] +.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] +.ds ae a\h'-(\w'a'u*4/10)'e +.ds Ae A\h'-(\w'A'u*4/10)'E +. 
\" corrections for vroff +.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' +.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' +. \" for low resolution devices (crt and lpr) +.if \n(.H>23 .if \n(.V>19 \ +\{\ +. ds : e +. ds 8 ss +. ds o a +. ds d- d\h'-1'\(ga +. ds D- D\h'-1'\(hy +. ds th \o'bp' +. ds Th \o'LP' +. ds ae ae +. ds Ae AE +.\} +.rm #[ #] #H #V #F C +.\" ======================================================================== +.\" +.IX Title "MaxAs::MaxAs 3" +.TH MaxAs::MaxAs 3 "2016-02-04" "perl v5.10.1" "User Contributed Perl Documentation" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. +.if n .ad l +.nh +.SH "NAME" +MaxAs::MaxAs \- Assembler for NVIDIA Maxwell architecture +.SH "SYNOPSIS" +.IX Header "SYNOPSIS" +.Vb 1 +\& maxas.pl [opts] +.Ve +.SH "DESCRIPTION" +.IX Header "DESCRIPTION" +See the documentation at: https://github.com/NervanaSystems/maxas +.SH "SEE ALSO" +.IX Header "SEE ALSO" +See the documentation at: https://github.com/NervanaSystems/maxas +.SH "AUTHOR" +.IX Header "AUTHOR" +Scott Gray, +.SH "COPYRIGHT AND LICENSE" +.IX Header "COPYRIGHT AND LICENSE" +The \s-1MIT\s0 License (\s-1MIT\s0) +.PP +Copyright (c) 2014 Scott Gray +.PP +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the \*(L"Software\*(R"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +.PP +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+.PP +\&\s-1THE\s0 \s-1SOFTWARE\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R", \s-1WITHOUT\s0 \s-1WARRANTY\s0 \s-1OF\s0 \s-1ANY\s0 \s-1KIND\s0, \s-1EXPRESS\s0 \s-1OR\s0 +\&\s-1IMPLIED\s0, \s-1INCLUDING\s0 \s-1BUT\s0 \s-1NOT\s0 \s-1LIMITED\s0 \s-1TO\s0 \s-1THE\s0 \s-1WARRANTIES\s0 \s-1OF\s0 \s-1MERCHANTABILITY\s0, +\&\s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0 \s-1AND\s0 \s-1NONINFRINGEMENT\s0. \s-1IN\s0 \s-1NO\s0 \s-1EVENT\s0 \s-1SHALL\s0 \s-1THE\s0 +\&\s-1AUTHORS\s0 \s-1OR\s0 \s-1COPYRIGHT\s0 \s-1HOLDERS\s0 \s-1BE\s0 \s-1LIABLE\s0 \s-1FOR\s0 \s-1ANY\s0 \s-1CLAIM\s0, \s-1DAMAGES\s0 \s-1OR\s0 \s-1OTHER\s0 +\&\s-1LIABILITY\s0, \s-1WHETHER\s0 \s-1IN\s0 \s-1AN\s0 \s-1ACTION\s0 \s-1OF\s0 \s-1CONTRACT\s0, \s-1TORT\s0 \s-1OR\s0 \s-1OTHERWISE\s0, \s-1ARISING\s0 \s-1FROM\s0, +\&\s-1OUT\s0 \s-1OF\s0 \s-1OR\s0 \s-1IN\s0 \s-1CONNECTION\s0 \s-1WITH\s0 \s-1THE\s0 \s-1SOFTWARE\s0 \s-1OR\s0 \s-1THE\s0 \s-1USE\s0 \s-1OR\s0 \s-1OTHER\s0 \s-1DEALINGS\s0 \s-1IN\s0 +\&\s-1THE\s0 \s-1SOFTWARE\s0. 
diff --git a/Assembler/MaxAs/blib/script/.exists b/Assembler/MaxAs/blib/script/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/blib/script/maxas.pl b/Assembler/MaxAs/blib/script/maxas.pl new file mode 100755 index 0000000..91cfa30 --- /dev/null +++ b/Assembler/MaxAs/blib/script/maxas.pl @@ -0,0 +1,289 @@ +#!/usr/bin/perl + +eval 'exec /usr/bin/perl -S $0 ${1+"$@"}' + if 0; # not running under some shell +use strict; +use MaxAs::Cubin; +use MaxAs::MaxAs; +use Data::Dumper; +use File::Spec; + +require 5.10.0; + +$Data::Dumper::Sortkeys = 1; + +my $mode = shift; + +# List cubin contents +if ($mode =~ /^\-?\-l/i) +{ + my $cubinFile = shift or usage(); + + my $cubin = MaxAs::Cubin->new($cubinFile); + + my $arch = $cubin->arch; + my $class = $cubin->class; + my $asize = $cubin->address_size; + my $kernels = $cubin->listKernels; + my $symbols = $cubin->listSymbols; + + printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize; + + foreach my $ker (sort keys %$kernels) + { + printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)}; + } + foreach my $sym (sort keys %$symbols) + { + printf "Symbol: %s\n", $sym; + } +} +# Test that the assembler can reproduce the op codes this cubin or sass contains +elsif ($mode =~ /^\-?\-t/i) +{ + my $reg = shift if $ARGV[0] =~ /^\-?\-r/i; + my $all = shift if $ARGV[0] =~ /^\-?\-a/i; + my $file = shift or usage(); + my $fh; + # sass file + if (-T $file) + { + open $fh, $file or die "$file: $!"; + } + # cubin file + else + { + my $cubin = MaxAs::Cubin->new($file); + my $arch = $cubin->arch; + + open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!"; + my $first = <$fh>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + } + exit(MaxAs::MaxAs::Test($fh, $reg, $all) ? 
1 : 0); +} +# Extract an asm file containing the desired kernel +elsif ($mode =~ /^\-?\-e/i) +{ + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $cubinFile = shift or usage(); + my $asmFile = shift; + my $cubin = MaxAs::Cubin->new($cubinFile); + my $arch = $cubin->arch; + my $kernels = $cubin->listKernels; + + #default the kernel name if not specified. + $kernelName ||= (sort keys %$kernels)[0]; + + my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName"; + + open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_50 -sass -fun $kernelName $cubinFile: $!"; + my $first = <$in>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n"; + + print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt)); + + print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n"; + + print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params}; + + print $out "#\n# Instructions:\n\n"; + + MaxAs::MaxAs::Extract($in, $out, $kernel->{Params}); + + close $out if $asmFile; + close $in; +} +# Extract a kernel from a sass dump +elsif ($mode =~ /^\-?\-s/i) +{ + my $sassFile = shift or usage(); + my $asmFile = shift; + + open my $in, $sassFile or die "$sassFile: $!"; + + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + MaxAs::MaxAs::Extract($in, $out, []); + + close $out if $asmFile; + close $in; +} +# Insert the kernel asm back into the cubin: +elsif ($mode =~ /^\-?\-i/i) +{ + my $nowarn; + if ($ARGV[0] =~ /^\-?\-w/i) + { + $nowarn = shift; + } + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $noReuse = shift if $ARGV[0] 
=~ /^\-?\-n/i; + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';" + } + + my $asmFile = shift or usage(); + my $cubinFile = shift or usage(); + my $newCubin = shift || $cubinFile; + + my $file; + if (open my $fh, $asmFile) + { + local $/; + $file = <$fh>; + close $fh; + } + else { die "$asmFile: $!" } + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + # extract the kernel name from the file + ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName; + die "asm file missing kernel name or is badly formatted" unless $kernelName; + + my $kernel = MaxAs::MaxAs::Assemble($file, $include, !$noReuse, $nowarn); + + my $cubin = MaxAs::Cubin->new($cubinFile); + $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName"; + + $cubin->modifyKernel(%$kernel); + + $cubin->write($newCubin); + + printf "Kernel: $kernelName, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f% (%d/%d)\n", + @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)}; + +} +# Preprocessing: +elsif ($mode =~ /^\-?\-p/i) +{ + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';"; + } + my $debug = shift if $ARGV[0] =~ /^\-?\-d/i; + my $asmFile = shift or usage(); + my $asmFile2 = shift; + + die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2; + + open my $fh, $asmFile or die "$asmFile: $!"; + local $/; + my $file = <$fh>; + close $fh; + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + if ($asmFile2) + { + open $fh, ">$asmFile2" or die "$asmFile2: $!"; + } + else + { + $fh = \*STDOUT; + } + print $fh MaxAs::MaxAs::Preprocess($file, $include, $debug); + close $fh; +} +# get version information +elsif ($mode =~ /^\-?\-v/i) +{ + 
print "$MaxAs::MaxAs::VERSION\n"; +} +else +{ + print "$mode\n"; + usage(); +} + +exit(0); + + + +sub usage +{ + print < + + Test a cubin or sass file to to see if the assembler can reproduce all of the contained opcodes. + Also useful for extending the missing grammar rules. Defaults to only showing failures without --all. + With the --reg flag it will show register bank conflicts not hidden by reuse flags. + + maxas.pl --test|-t [--reg|-r] [--all|-a] + + Extract a single kernel into an asm file from a cubin. + Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin. + + maxas.pl --extract|-e [--kernel|-k kernel_name] [asm_file] + + Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes. + Include the debug flag to print out detailed scheduler info. + + maxas.pl --pre|-p [--debug|-d] [new_asm_file] + + Insert the kernel asm back into the cubin. Overwrite existing or create new cubin. + Optionally you can skip register reuse flag auto insertion. This allows you to observe + performance without any reuse or you can use it to set the flags manually in your sass. 
+ + maxas.pl --insert|-i [--noreuse|-n] [new_cubin_file] + + Display version information and exit: + + maxas.pl --version|-v + +EOF + exit(1); +} + +__END__ diff --git a/Assembler/MaxAs/cpanfile b/Assembler/MaxAs/cpanfile new file mode 100644 index 0000000..e8281c5 --- /dev/null +++ b/Assembler/MaxAs/cpanfile @@ -0,0 +1,4 @@ +requires 'perl', '5.10.0'; + +requires 'Carp', '1.29'; +requires 'Data::Dumper', '2.145'; diff --git a/Assembler/MaxAs/lib/MaxAs/Cubin.pm b/Assembler/MaxAs/lib/MaxAs/Cubin.pm new file mode 100644 index 0000000..5900958 --- /dev/null +++ b/Assembler/MaxAs/lib/MaxAs/Cubin.pm @@ -0,0 +1,684 @@ +package MaxAs::Cubin; + +use strict; +use Data::Dumper; + +my @Elf32_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + L entry + L phOffset + L shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf64_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + Q entry + Q phOffset + Q shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf32_PrgHdr = qw( + L type + L offset + L vaddr + L paddr + L fileSize + L memSize + L flags + L align +); +my @Elf64_PrgHdr = qw( + L type + L flags + Q offset + Q vaddr + Q paddr + Q fileSize + Q memSize + Q align +); +my @Elf32_SecHdr = qw( + L name + L type + L flags + L addr + L offset + L size + L link + L info + L align + L entSize +); +my @Elf64_SecHdr = qw( + L name + L type + Q flags + Q addr + Q offset + Q size + L link + L info + Q align + Q entSize +); +my @Elf32_SymEnt = qw( + L name + L value + L size + C info + C other + S shIndx +); +my @Elf64_SymEnt = qw( + L name + C info + C other + S shIndx + Q value + Q size +); +my @symBind = qw(LOCAL GLOBAL WEAK); + +# Split the Elf Header defs into template strings (T) and corresponding hash keys columns (C) +my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, 
@elfHdrC, @prgHdrC, @secHdrC, @symHdrC); + +$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr; +$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr; +$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr; +$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt; + +$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr; +$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr; +$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr; +$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt; + +$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr ]; +$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ]; +$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ]; +$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ]; + +$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr ]; +$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ]; +$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ]; +$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ]; + +# Load a cubin ELF file +sub new +{ + my ($package, $file) = @_; + + my $cubin = bless { fileName => $file }, $package; + + open my $fh, $file or die "$file: $!"; + binmode($fh); + + # Read in assuming 32 bit header + my $data; + read $fh, $data, 0x34; + my $elfHdr = $cubin->{elfHdr} = {}; + @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data; + + # 1: 32bit, 2: 64bit + my $class = $elfHdr->{fileClass}; + + # re-read in with 64 bit header if needed + if ($class == 2) + { + seek $fh, 0, 0; + read $fh, $data, 0x46; + @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data; + + $cubin->{Class} = 64; + } + else + { + $cubin->{Class} = 32; + } + + # verify sm_50 cubin + $cubin->{Arch} = $elfHdr->{flags} & 0xFF; + die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50; + + $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32; + + # Read in Program Headers + seek $fh, $elfHdr->{phOffset}, 0; + foreach (1 .. 
$elfHdr->{phNum}) + { + read $fh, $data, $elfHdr->{phEntSize}; + + my %prgHdr = (Indx => $_ - 1); + @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data; + push @{$cubin->{prgHdrs}}, \%prgHdr; + } + + # Read in Section Headers + seek $fh, $elfHdr->{shOffset}, 0; + foreach (1 .. $elfHdr->{shNum}) + { + read $fh, $data, $elfHdr->{shEntSize}; + + my %secHdr = (Indx => $_ - 1); + @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data; + push @{$cubin->{secHdrs}}, \%secHdr; + } + + # Read in Section data + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $data = ''; + # Skip sections with no data (type NULL or NOBITS) + if ($secHdr->{size} && $secHdr->{type} != 8) + { + seek $fh, $secHdr->{offset}, 0; + read $fh, $data, $secHdr->{size}; + } + # Convert string tables to maps + if ($secHdr->{type} == 3) # STRTAB + { + my $strTab = $secHdr->{StrTab} = {}; + my $indx = 0; + foreach my $str (split "\0", $data) + { + $strTab->{$indx} = $str; + $indx += 1 + length($str); + } + } + # Read in Symbol data + if ($secHdr->{type} == 2) # SYMTAB + { + my $offset = 0; + while ($offset < $secHdr->{size}) + { + my $symEnt = {}; + @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize}); + $offset += $secHdr->{entSize}; + + push @{$secHdr->{SymTab}}, $symEnt; + } + } + # Cache raw data for further processing and writing + $secHdr->{Data} = unpack 'H*', $data; + } + close $fh; + + # Update section headers with their names. Map names directly to headers. 
+ my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab}; + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $secHdr->{Name} = $shStrTab->{$secHdr->{name}}; + $cubin->{$secHdr->{Name}} = $secHdr; + } + + # Update symbols with their names + # For the Global functions, extract kernel meta data + # Populate the kernel hash + my $strTab = $cubin->{'.strtab'}{StrTab}; + foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}}) + { + $symEnt->{Name} = $strTab->{$symEnt->{name}}; + + # Attach symbol to section + my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}]; + $secHdr->{SymbolEnt} = $symEnt; + + # Look for symbols tagged FUNC + if (($symEnt->{info} & 0x0f) == 0x02) + { + # Create a hash of kernels for output + my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr; + + # Extract local/global/weak binding info + $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4]; + + # Extract the kernel instructions + $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ]; + + # Extract the max barrier resource identifier used and add 1. Should be 0-16. + # If a register is used as a barrier resource id, then this value is the max of 16. + $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20; + + # Extract the number of allocated registers for this kernel. + $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24; + + # Extract the size of shared memory this kernel uses. + my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"}; + $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0; + + # Attach constant0 section + $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"}; + + # Extract the kernel parameter data. 
+ my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"}; + if ($paramSec) + { + # Extract raw param data + my @data = unpack "L*", pack "H*", $paramSec->{Data}; + + $paramSec->{ParamData} = \@data; + $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ]; + + # Find the first param delimiter + my $idx = 0; + $idx++ while $idx < @data && $data[$idx] != 0x00080a04; + + my $first = $data[$idx+2] & 0xFFFF; + #my $size = $data[$idx+2] >> 16; + $idx += 4; + + my @params; + while ($idx < @data && $data[$idx] == 0x000c1704) + { + # Get the ordinal, offset, size and pointer alignment for each param + my $ord = $data[$idx+2] & 0xFFFF; + my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16); + my $psize = $data[$idx+3] >> 18; + my $align = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0; + unshift @params, "$ord:$offset:$psize:$align"; + $idx += 4; + } + my @staticParams = @data[0 .. ($idx-1)]; + + my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize); + while ($idx < @data) + { + my $code = $data[$idx] & 0xffff; + my $size = $data[$idx] >> 16; + $idx++; + + # EIATTR_MAXREG_COUNT + if ($code == 0x1b03) + { + $maxregCount = $size; + } + # EIATTR_S2RCTAID_INSTR_OFFSETS + elsif ($code == 0x1d04) + { + while ($size > 0) + { + push @ctaidOffsets, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_EXIT_INSTR_OFFSETS + elsif ($code == 0x1c04) + { + while ($size > 0) + { + push @exitOffsets, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_CTAIDZ_USED + elsif ($code == 0x0401) + { + $ctaidzUsed = 1; + } + # EIATTR_REQNTID + elsif ($code == 0x1004) + { + while ($size > 0) + { + push @reqntid, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_MAX_THREADS + elsif ($code == 0x0504) + { + while ($size > 0) + { + push @maxntid, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_CRS_STACK_SIZE + elsif ($code == 0x1e04) + { + while ($size > 0) + { + push @stackSize, $data[$idx++]; + $size -= 4; + } + } + else + { + 
printf STDERR "Unknown Code 0x%02x (size:%d)\n", $code, $size; + } + } + $kernelSec->{Params} = \@params; + $kernelSec->{ParamCnt} = scalar @params; + + $paramSec->{StaticParams} = \@staticParams; + $paramSec->{MAXREG_COUNT} = $maxregCount; + $paramSec->{ExitOffsets} = \@exitOffsets; + $paramSec->{CTAIDOffsets} = \@ctaidOffsets; + $paramSec->{CTAIDZUsed} = $ctaidzUsed; + $paramSec->{REQNTID} = \@reqntid; + $paramSec->{MAXNTID} = \@maxntid; + $paramSec->{STACKSIZE} = \@stackSize; + } + # print Dumper($paramSec); + # exit(); + } + # Note GLOBALs found in this cubin + elsif (($symEnt->{info} & 0x10) == 0x10) + { + $cubin->{Symbols}{$symEnt->{Name}} = $symEnt; + } + } + + # print "phOffset: $elfHdr->{phOffset}\n"; + # print "shOffset: $elfHdr->{shOffset}\n"; + # foreach my $secHdr (@{$cubin->{secHdrs}}) + # { + # print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n"; + # } + # my $p = 0; + # foreach my $prgHdr (@{$cubin->{prgHdrs}}) + # { + # print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n"; + # $p++; + # } + # exit(); + + # print Dumper($cubin->{prgHdrs}); + # exit(); + return $cubin; +} +sub class +{ + return shift()->{Class}; +} +sub arch +{ + return shift()->{Arch}; +} +sub address_size +{ + return shift()->{AddressSize}; +} +sub listKernels +{ + return shift()->{Kernels}; +} +sub listSymbols +{ + return shift()->{Symbols}; +} +sub getKernel +{ + my ($cubin, $kernel) = @_; + return $cubin->{Kernels}{$kernel}; +} + +sub modifyKernel +{ + my ($cubin, %params) = @_; + + my $kernelSec = $params{Kernel}; + my $newReg = $params{RegCnt}; + my $newBar = $params{BarCnt}; + my $exitOffsets = $params{ExitOffsets}; + my $ctaidOffsets = $params{CTAIDOffsets}; + my $ctaidzUsed = $params{CTAIDZUsed}; + my $newData = $params{KernelData}; + my $newSize = @$newData * 8; + + die "255 register max" if $newReg > 255; + die "new 
kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63; + die "16 is max barrier count" if $newBar > 16; + + my $paramSec = $kernelSec->{ParamSec}; + my $kernelName = $kernelSec->{SymbolEnt}{Name}; + my $maxregCount = $paramSec->{MAXREG_COUNT}; + my $stackSize = $paramSec->{STACKSIZE}; + + # update the kernel + $kernelSec->{KernelData} = $newData; + $kernelSec->{Data} = unpack "H*", pack "Q*", @$newData; + + if ($newReg != $kernelSec->{RegCnt}) + { + print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n"; + $kernelSec->{RegCnt} = $newReg; + $kernelSec->{info} &= ~0xff000000; + $kernelSec->{info} |= $newReg << 24; + } + if ($newBar != $kernelSec->{BarCnt}) + { + print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n"; + $kernelSec->{BarCnt} = $newBar; + $kernelSec->{flags} &= ~0x01f00000; + $kernelSec->{flags} |= $newBar << 20; + } + + my @paramData = @{$paramSec->{StaticParams}}; + + if (defined $maxregCount) + { + push @paramData, ($maxregCount << 16) | 0x1b03; + } + + my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets; + my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}}; + + if ($newCTAIDs ne $oldCTAIDs) + { + print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"; + } + if (@$ctaidOffsets) + { + push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04; + push @paramData, @$ctaidOffsets; + } + + my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets; + my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}}; + + if ($newExits ne $oldExits) + { + print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"; + } + if (@$exitOffsets) + { + push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04; + push @paramData, @$exitOffsets; + } + + if ($ctaidzUsed != $paramSec->{CTAIDZUsed}) + { + print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"; + } + if ($ctaidzUsed) + { + 
push @paramData, 0x0401; + } + + if (@{$paramSec->{REQNTID}}) + { + push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004; + push @paramData, @{$paramSec->{REQNTID}}; + } + if (@{$paramSec->{MAXNTID}}) + { + push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504; + push @paramData, @{$paramSec->{MAXNTID}}; + } + + if (@$stackSize) + { + push @paramData, (scalar(@$stackSize) << 18) | 0x1e04; + push @paramData, @$stackSize; + } + + my $newParamSize = scalar(@paramData)*4; + $paramSec->{Data} = unpack "H*", pack "L*", @paramData; + if ($newParamSize != $paramSec->{size}) + { + print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n"; + $cubin->updateSize($paramSec, $newParamSize); + } + + if ($newSize != $kernelSec->{size}) + { + print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n"; + $cubin->updateSize($kernelSec, $newSize, 1); + } +} + +sub updateSize +{ + my ($cubin, $sec, $newSize, $updatePrgSize) = @_; + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + # update section header + my $delta = $newSize - $sec->{size}; + $sec->{size} = $newSize; + + # update symtab section + if ($sec->{SymbolEnt}) + { + $sec->{SymbolEnt}{size} = $newSize; + my $symSection = $cubin->{'.symtab'}; + $symSection->{Data} = ''; + foreach my $symEnt (@{$symSection->{SymTab}}) + { + $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}}; + } + } + + my $pos = $elfHdr->{ehSize}; + my %sizeMap; + + # update section header offsets + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + # skip first header + next if $secHdr->{align} == 0; + + # NOBITS data sections are size 0 + my $size = $secHdr->{type} == 8 ? 
0 : $secHdr->{size}; + + # Add any needed padding between sections + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pos += $secHdr->{align} - $pad; + } + # map old offset to new + $sizeMap{$secHdr->{offset}} = $pos; + + # update offset + $secHdr->{offset} = $pos; + + # advance position by size + $pos += $size; + } + + # compute total section header size + my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset}; + + # map old offset to new + $sizeMap{$elfHdr->{shOffset}} = $pos; + $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize; + + $elfHdr->{shOffset} = $pos; + $elfHdr->{phOffset} = $pos + $shSize; + + # update program header offsets and sizes + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + # Not sure how best to adjust these so just assume they'll track other offsets. + $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}}; + + # If the kernel sizes changes, also update the associated ProgramHeader. + # Note that this size is the kernel size plus any constant section sizes. + if ($updatePrgSize && $prgHdr->{type} == 1 && + $sec->{offset} >= $prgHdr->{offset} && + $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta) + { + $prgHdr->{fileSize} += $delta; + $prgHdr->{memSize} += $delta; + } + } +} + +# Write out the cubin after modifying it. 
+sub write +{ + my ($cubin, $file) = @_; + + open my $fh, ">$file" or die "Error: could not open $file for writing: $!"; + binmode($fh); + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + # write elf header + print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}}; + my $pos = $elfHdr->{ehSize}; + + # write section data + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + # Skip NULL and NOBITS data sections + next if $secHdr->{size} == 0 || $secHdr->{type} == 8; + + # Add any needed padding between sections + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pad = $secHdr->{align} - $pad; + print $fh join '', "\0" x $pad; + $pos += $pad; + } + + print $fh pack 'H*', $secHdr->{Data}; + $pos += $secHdr->{size}; + } + + # write section headers + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}}; + } + + #write program headers + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}}; + } + close $fh; +} + +__END__ + diff --git a/Assembler/MaxAs/lib/MaxAs/MaxAs.pm b/Assembler/MaxAs/lib/MaxAs/MaxAs.pm new file mode 100644 index 0000000..ac79952 --- /dev/null +++ b/Assembler/MaxAs/lib/MaxAs/MaxAs.pm @@ -0,0 +1,2105 @@ +package MaxAs::MaxAs; + +require 5.10.0; + +use strict; +use Data::Dumper; +use MaxAs::MaxAsGrammar; +use File::Spec; +use Carp; +use POSIX; +use List::Util qw[min max]; + +our $VERSION = '1.06'; + +# these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump +my %relOffset = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT); + +# these ops use absolute addresses +my %absOffset = map { $_ => 1 } qw(JCAL); + +my %jumpOp = (%relOffset, %absOffset); + +# These instructions use r0 but do not write to r0 +my %noDest = map { $_ => 1 } qw(ST STG STS STL RED); + +# Map register slots to reuse control codes +my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4); + +# break the 
registers down into source and destination categories for the scheduler +my %srcReg = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X); +my %destReg = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC); +my %regops = (%srcReg, %destReg); +my @itypes = qw(class lat rlat tput dual); + +# init resource usage +my $activeWarp = 1; +my $scheduler = 2; +my $warpSize = 32; +my $bankWidth = 4; +my $maxThreads = 1024; +my $maxSharedMem = 49152; +my $maxReg = 65536; + +my $AnalyzeRe = qr'^[\t ]*(.*?)^\s*\n?'ms; + +sub Occupancy +{ + my ($fileName) = @_; + + print "Occupancy\n"; + + local $/ = "\n"; + open my $fh, "<", $fileName or die "Cannot open: ", $fileName; + my $usedThreads = <$fh>; + chomp $usedThreads; + $usedThreads =~ s/threads=//g; + + my $usedSharedMem = <$fh>; + chomp $usedSharedMem; + $usedSharedMem =~ s/shared=//g; + + my $usedReg = <$fh>; + chomp $usedReg; + $usedReg =~ s/regs=//g; + + my $activeBlock = min(ceil($maxThreads / $usedThreads), + ceil($maxSharedMem / $usedSharedMem), ceil(ceil($maxReg / $usedReg) / $usedThreads)); + $activeWarp = $activeBlock * ceil($usedThreads / $warpSize); + + print "Active Blocks: ", $activeBlock, "\n"; + print "Active Warps: ", $activeWarp, "\n\n\n"; + close $fh; +} + +sub LongestPath +{ + my ($instructs) = @_; + + # calculate longest path + my @path; + foreach my $i (0 .. $#$instructs) + { + push @path, 0; + } + + foreach my $i (0 .. $#$instructs) + { + my $instruct = $instructs->[$i]; + foreach my $child (@{$instruct->{children}}) { + my $ins = @$child[0]; + my $weight = @$child[1]; + $path[$ins] = $weight + $path[$i] if $weight + $path[$i] > $path[$ins]; + } + } + + my $longestPath = 0; + foreach my $i (0 .. 
$#$instructs) + { + $longestPath = $path[$i] if $path[$i] > $longestPath; + } + + return $longestPath; +} + +sub PreprocessBlock +{ + my ($analyzeBlock) = @_; + my ($lineNum, @instructs, @branches, %labels); + + # push first dummy instruct + push @instructs, {dualCnt=>0, nodual=>1}; + + # Preprocess instructions + foreach my $line (split "\n", $analyzeBlock) + { + # keep track of line nums in the physical file + $lineNum++; + + next unless preProcessLine($line); + + # Match an instruction + if (my $inst = processAsmLine($line, $lineNum)) + { + # Save us from crashing the display driver + die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n" + if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0; + + # track branches/jumps/calls/etc for label remapping + push @branches, @instructs+0 if exists $jumpOp{$inst->{op}}; + + # add the op name and full instruction text + push @instructs, $inst; + } + # Match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + # map the label name to the index of the instruction about to be inserted + $labels{$1} = @instructs+0; + } + else + { + die "badly formed line at $lineNum: $line\n"; + } + } + + # remap labels + foreach my $i (@branches) + { + if (exists $relOffset{$instructs[$i]{op}}) + { + $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; + } + else + { + $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; + } + } + + return @instructs; +} + +sub CalculateEfficiency +{ + my ($instructs) = @_; + print "Instructions\tDispatches\tEcompute\tEcmp\tEmem\n"; + + # Analyze efficiency + foreach my $i (0 .. 
$#$instructs) + { + my $instruct = $instructs->[$i]; + $instruct->{dualCnt} = 0; + $instruct->{nodual} = 1; + + next unless $i != 0; + + my ($op, $inst) = @{$instructs->[$i]}{qw(op inst)}; + + foreach my $gram (@{$grammar{$op}}) + { + my $capData = parseInstruct($inst, $gram) or next; + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + $instruct->{dualCnt} = $instruct->{dual} ? 1 : 0; + + # Handle P2R and R2P specially + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + # These instructions can't be dual issued + $instruct->{nodual} = 1; + } + + # For pascal and maxwell + my $dispatches = 1; + my $instructType = $gram->{type}; + if ($instructType->{class} eq 'x32' || $instructType->{class} eq 's2r' || + $instructType->{class} eq 'qtr' || $instructType->{class} eq 'rro' || + $instructType->{class} eq 'vote') + { + my $units = $instructType->{units}; + $instruct->{efficiency} = 1 / ceil(($dispatches * $warpSize) / $units); + } + elsif ($instructType->{class} eq 'shift' || $instructType->{class} eq 'cmp') + { + my $units = $instructType->{units}; + my $tput = $instructType->{tput}; + $instruct->{efficiency} = 1 / (ceil(($dispatches * $warpSize) / $units) * $tput); + } + elsif ($instructType->{class} eq 'mem') + { + my $units = $instructType->{units}; + my $memType = $capData->{type}; + my $issue = 1; + # vector instruction + if ($memType =~ s/^\.//g) + { + $issue *= $memType / $warpSize; + } + # TODO(keren): cache instruction ??? + if ($op eq 'LDG') + { + $issue = 1; + } + $instruct->{efficiency} = 1 / ceil(($dispatches * $warpSize) / $units * $issue); + } + else + { + die "No such instruct type: ", Dumper($instruct); + } + if ($i > 1 and $instruct->{dual}) { + my ($prevOp) = @{$instructs->[$i - 1]}{qw(op)}; + foreach my $prevGram (@{$grammar{$prevOp}}) + { + #TODO(keren): not noly same class, but also same units + if ($prevGram->{type}->{class} eq $instructType->{class}) + { + #TODO(keren): ceil? 
+ $instructs->[$i - 1]->{efficiency} = $instruct->{efficiency} = + 1 / 2 * $instruct->{efficiency}; + } + } + } + } + } + foreach my $i (0 .. $#$instructs) + { + next unless $i > 0; + + my $instruct = $instructs->[$i]; + my ($op, $inst) = @{$instructs->[$i]}{qw(op inst)}; + + foreach my $gram (@{$grammar{$op}}) + { + my $dispatches = 1; + print "\t" if $instruct->{dualCnt}; + print $inst, "\t", $dispatches, "\t"; + my $instructType = $gram->{type}; + if ($instructType->{class} eq 'x32' || $instructType->{class} eq 's2r' || + $instructType->{class} eq 'qtr' || $instructType->{class} eq 'rro' || + $instructType->{class} eq 'vote') + { + print $instruct->{efficiency}, "\t0\t0"; + } + elsif ($instructType->{class} eq 'shift' || $instructType->{class} eq 'cmp') + { + print "0\t", $instruct->{efficiency}, "\t0"; + } + elsif ($instructType->{class} eq 'mem') + { + # TODO(keren): simulate + print "0\t0\t", $instruct->{efficiency}; + } + else + { + die "No such instruct type: ", Dumper($instruct); + } + print "\n"; + } + } +} + +sub AnalyzeDAG +{ + my ($instructs, $effInstructs, $regMap) = @_; + my $vectors = $regMap->{__vectors}; + my %deps; + + # efficiency dependencies + foreach my $i (0 .. 
$#$instructs) + { + next unless $i != 0; + my $instruct = $instructs->[$i]; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $parent = $instructs->[$i - 1]; + my $effParent = $effInstructs->[$i - 1]; + my $instructType = $gram->{type}; + if ($parent->{dualCnt} == 1) # parent dual + { + if ($instruct->{dualCnt} == 0) # links to parent and grandparent + { + my $grandparent = $instructs->[$i - 2]; + my $effGrandparent = $effInstructs->[$i - 2]; + push @{$parent->{children}}, [$i, 1 / $instruct->{efficiency}]; + push @{$grandparent->{children}}, [$i, 1 / $instruct->{efficiency}]; + push @{$effParent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType]; + push @{$effGrandparent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}]; + } + else # not recommend issue pattern, TODO(keren): cannot dual in this way? + { + my $grandparent = $instructs->[$i - 2]; + my $effGrandparent = $effInstructs->[$i - 2]; + if ($grandparent->{dualCnt} == 0) + { # links to grandparent and parent + push @{$parent->{children}}, [$i, 1 / $instruct->{efficiency}]; + push @{$grandparent->{children}}, [$i, 1 / $instruct->{efficiency}]; + push @{$effParent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}]; + push @{$effGrandparent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}]; + } + else + { # links to parent becuase it is illegal + push @{$parent->{children}}, [$i, 1 / $instruct->{efficiency}]; + push @{$effParent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}]; + } + } + } + elsif ($parent->{dualCnt} == 0) # parent single + { + if ($instruct->{dualCnt} == 0) # links to parent + { + push @{$parent->{children}}, [$i, 1 / $instruct->{efficiency}]; + push @{$effParent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}]; + } + else # links to grandparent + { + my $grandparent = $instructs->[$i - 2]; + my $effGrandparent = $effInstructs->[$i - 2]; + push 
@{$grandparent->{children}}, [$i, 1 / $instruct->{efficiency}]; + push @{$effGrandparent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}]; + } + } + } + } + + foreach my $i (0 .. $#$instructs) + { + next unless $i != 0; + + #skip control instructions + my $instruct = $instructs->[$i]; + my ($op, $inst) = @{$instructs->[$i]}{qw(op inst)}; + + # write dependencies + my $match = 0; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $capData = parseInstruct($instruct->{inst}, $gram) or next; + my (@dest, @src); + + # copy over instruction types for easier access + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + + # A predicate prefix is treated as a source reg + push @src, $instruct->{predReg} if $instruct->{pred}; + + # Handle P2R and R2P specially + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src; + my $mask = hex($capData->{i20w7}); + foreach my $p (0..6) + { + if ($mask & (1 << $p)) + { + push @$list, "P$p"; + } + # make this instruction dependent on any predicates it's not setting + # this is to prevent a race condition for any predicate sets that are pending + elsif ($instruct->{op} eq 'R2P') + { + push @src, "P$p"; + } + } + # These instructions can't be dual issued + $instruct->{nodual} = 1; + } + # Populate our register source and destination lists, skipping any zero or true values + foreach my $operand (grep { exists $regops{$_} } sort keys %$capData) + { + # figure out which list to populate + my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src; + + # Filter out RZ and PT + my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT'; + + if ($capData->{$operand} ne $badVal) + { + # add the value to list with the correct prefix + push @$list, + $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) : + $operand eq 'r8' ? 
map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) : + $operand eq 'CC' ? 'CC' : + $operand eq 'X' ? 'CC' : + getRegNum($regMap, $capData->{$operand}); + } + } + + # Find Read-After-Write dependencies + foreach my $src (grep { exists $deps{$_} } @src) + { + # the parent should be the most recently added dest op to the stack + foreach my $parent (@{$deps{$src}}) + { + # add this instruction as a child of the parent + # set the edge to the total latency of reg source availability + #print "R $parent->{inst}\n\t\t$instruct->{inst}\n"; + my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat}; + # update weights + my $find = 0; + foreach my $child (@{$parent->{children}}) + { + my $ins = $instructs->[$child->[0]]; + my $weight = $child->[1]; + if ($ins eq $instruct) + { + $child->[1] = $weight > $latency ? $weight : $latency; + $find = 1; + last; + } + } + # parent and child does not has efficiency dependency + if ($find == 0) + { + push @{$parent->{children}}, [$i, $latency]; + } + $instruct->{parents}++; + + # if the destination was conditionally executed, we also need to keep going back till it wasn't + last unless $parent->{pred}; + } + } + + # For a dest reg, push it onto the write stack + unshift @{$deps{$_}}, $instruct foreach @dest; + + $match = 1; + last; + } + + die "Unable to recognize instruction: $instruct->{inst}\n" unless $match; + } +} + +sub ConstructEfficiencyDAG +{ + my ($effInstructs, $typeInstructs, $types) = @_; + + foreach my $i (0 .. 
$#$effInstructs) + { + my $instruct = $effInstructs->[$i]; + my $typeInstruct = $typeInstructs->[$i]; + + foreach my $child (@{$instruct->{children}}) + { + my $instructType = $child->[2]; + + my $find = 0; + my $weight = 0; + foreach my $type (@$types) + { + if ($instructType eq $type) + { + $weight = $child->[1]; + } + } + push @{$typeInstruct->{children}}, [$child->[0], $weight]; + } + } +} + +sub CalculateBcomp +{ + my ($instructs) = @_; + # Bcomp + my $unitsSum = 0; + my $unitsUse = 0; + + foreach my $i (0 .. $#$instructs) + { + next unless $i != 0; + + my $instruct = $instructs->[$i]; + my ($op, $inst) = @{$instructs->[$i]}{qw(op inst)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + my $dispatches = 1; + my $instructType = $gram->{type}; + if ($instructType->{class} eq 'x32' || $instructType->{class} eq 's2r' || $instructType->{class} eq 'qtr' || + $instructType->{class} eq 'rro' || $instructType->{class} eq 'vote') { + $unitsSum = $unitsSum + $instructType->{units}; + $unitsUse = $unitsUse + $dispatches * $warpSize + } + } + } + print "Bcomp: ", $unitsSum > 0 ? 1.0 - $unitsUse / $unitsSum : 0, "\n"; +} + +sub CalculateBmem +{ + my ($instructs) = @_; + # Bmem + my $sharedWidthSum = 0; + my $sharedWidthUse = 0; + my $globalWidthSum = 0; + my $globalWidthUse = 0; + foreach my $i (0 .. 
$#$instructs) + { + next unless $i != 0; + + my $instruct = $instructs->[$i]; + my ($op, $inst) = @{$instructs->[$i]}{qw(op inst)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + my $capData = parseInstruct($inst, $gram) or next; + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + my $dispatches = 1; + my $instructType = $gram->{type}; + if ($instructType->{class} eq 'mem') { + my $memType = $capData->{type}; + # default 32 bit + my $insWidth = 4; + # vector instruction + if ($memType =~ s/^\.//g) { + $insWidth = $memType / 8; + } + if ($instructType->{type} eq 'global') { + $globalWidthSum = $globalWidthSum + 16 * $warpSize; # LDG.128 + #TODO cache + if ($op eq 'LDG') { + $globalWidthUse = $globalWidthUse + $insWidth * $warpSize; + } else { + $globalWidthUse = $globalWidthUse + ($insWidth / ceil($insWidth / 4)) * $warpSize; + } + } else { #shared + $sharedWidthSum = $sharedWidthSum + $bankWidth * $warpSize; + $sharedWidthUse = $sharedWidthUse + ($insWidth / ceil($insWidth / $bankWidth)) * $warpSize; + } + } + } + } + print "Bshared: ", $sharedWidthSum > 0 ? 1.0 - $sharedWidthUse / $sharedWidthSum : 0, "\n"; + print "Bglobal: ", $globalWidthSum > 0 ? 1.0 - $globalWidthUse / $globalWidthSum : 0, "\n"; +}

# Bilp
# Builds one efficiency DAG per execution unit from $effInstructs, takes the
# longest path through each, and prints "Bilp: " followed by
# 1 - (largest per-unit longest path / $cweff), or 0 when $cweff is not positive.
#
#   $effInstructs - array ref handed to ConstructEfficiencyDAG for every unit
#   $cweff        - longest path over the whole efficiency DAG (the denominator)
sub CalculateBilp
{
    my ($effInstructs, $cweff) = @_;

    # instruction classes handled by each unit
    # TODO(keren): analyze more units
    my @unitClasses = (
        ['s2r', 'x32', 'shift', 'cmp', 'vote'],  # x32
        ['x64'],                                 # x64
        ['qtr', 'rro'],                          # sp
        ['mem'],                                 # mem
    );

    # every unit gets its own node list: one empty hash per instruction
    my @unitNodes = map { [ map { +{} } 0 .. $#$effInstructs ] } @unitClasses;

    # first build all four DAGs, then measure them
    ConstructEfficiencyDAG($effInstructs, $unitNodes[$_], $unitClasses[$_]) foreach 0 .. $#unitClasses;
    my @unitEff = map { scalar LongestPath($_) } @unitNodes;

    my $maxeff = max(@unitEff);

    print "Bilp: ", $cweff > 0 ? 1.0 - $maxeff / $cweff : 0, "\n";
}

# Bpipe
# Pushes the heaviest accumulated edge weight from every instruction to its
# children, remembering in {prev} which parent/edge produced it.  Then, from
# each instruction that ends a heaviest path, walks the {prev} chain back and
# sums the edges whose weight equals the parent's {lat}.  Prints "Bpipe: "
# followed by that sum divided by ($cweff * $activeWarp / $scheduler), or 0
# when the divisor is not positive.
#
#   $instructs - array ref of instruction hashes; {children} holds
#                [childIndex, weight] pairs
#   $cweff     - longest path over the efficiency DAG
sub CalculateBpipe
{
    my ($instructs, $cweff) = @_;

    my $last = $#$instructs;

    # heaviest weight known to reach each instruction, initially zero
    my @path = (0) x ($last + 1);

    for my $i (0 .. $last)
    {
        my $parent = $instructs->[$i];
        for my $edge (@{$parent->{children}})
        {
            my ($iChild, $weight) = @{$edge}[0, 1];
            next unless $weight + $path[$i] > $path[$iChild];

            # heavier route into the child: record it and where it came from
            $path[$iChild] = $weight + $path[$i];
            my $node = $instructs->[$iChild];
            $node->{prev} = {prevInstruct=>$parent, prevWeight=>$weight};
        }
    }

    # weight of the heaviest path overall
    my $longestPath = 0;
    for my $i (0 .. $last)
    {
        $longestPath = $path[$i] if $path[$i] > $longestPath;
    }

    my $longestLatency = 0;
    for my $i (0 .. $last)
    {
        # only instructions that terminate a heaviest path are traced back
        next unless $path[$i] == $longestPath;

        my $latencies = 0;
        my $node = $instructs->[$i];
        while (defined($node->{prev}))
        {
            my ($prevIns, $prevWeight) = @{$node->{prev}}{qw(prevInstruct prevWeight)};

            # count the edge only when it carries the parent's full latency
            $latencies = $latencies + $prevWeight if $prevIns->{lat} == $prevWeight;
            $node = $prevIns;
        }
        $longestLatency = $latencies if $latencies > $longestLatency;
    }

    my $eff = $cweff * $activeWarp / $scheduler;
    print "Bpipe: ", $eff > 0 ? $longestLatency / $eff : 0, "\n";
}

# Analyze a source file: preprocess it (keeping the analyze blocks), then for
# every analyze block compute per-instruction efficiency, build the dependency
# DAGs, print the predicted cycle count and print the bottleneck figures
# produced by CalculateBcomp, CalculateBmem, CalculateBilp and CalculateBpipe.
#
#   $file    - source text, handed to Preprocess
#   $include - include location, handed to Preprocess unchanged
sub Analyze
{
    my ($file, $include) = @_;

    my $regMap = {};
    $file = Preprocess($file, $include, 0, $regMap, 1);

    # pull out every analyze block
    my @analyzeBlocks = $file =~ /$AnalyzeRe/g;

    my $blockNum = 0;
    for my $block (@analyzeBlocks)
    {
        print "Analyze block $blockNum\n\n";
        $blockNum++;

        # turn the block text into instructions and rate each of them
        my @instructs = PreprocessBlock($block);
        CalculateEfficiency(\@instructs);

        # dependency analysis fills a parallel list of efficiency nodes
        my @effInstructs = map { +{} } @instructs;
        AnalyzeDAG(\@instructs, \@effInstructs, $regMap);

        # the longest path through the instruction DAG is the prediction
        my $predictedCycle = LongestPath(\@instructs);
        print "predict cycles $predictedCycle\n";

        ## bottleneck figures
        CalculateBcomp(\@instructs);
        CalculateBmem(\@instructs);
        my $cweff = LongestPath(\@effInstructs);
        CalculateBilp(\@effInstructs, $cweff);
        CalculateBpipe(\@instructs, $cweff);

        print "\n\n";
    }

    print "End analyze\n";
}

# Preprocess and Assemble a source file +sub Assemble +{ + my ($file, $include, $doReuse, $nowarn) = @_; + + my $regMap = {}; + $file = Preprocess($file, $include, 0, $regMap, 0); + my $vectors = delete $regMap->{__vectors}; + my $regBank = delete $regMap->{__regbank}; + + # initialize cubin counts + my $regCnt = 0; + my $barCnt = 0; + + my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse); + + # initialize the first control instruction + push @instructs, $ctrl = {}; + + foreach my $line (split "\n", $file) + { + # keep track of line nums in the physical file + $lineNum++; + + next unless preProcessLine($line); + + # match an instruction + if (my $inst = processAsmLine($line, $lineNum)) + { + # Save us from crashing the display driver + die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n" + if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0; + + # track branches/jumps/calls/etc for label remapping + push @branches, @instructs+0 if exists $jumpOp{$inst->{op}}; + + # push the control code onto the control instruction + push @{$ctrl->{ctrl}}, $inst->{ctrl}; + + # now point the
instruction to its associated control instruction + $inst->{ctrl} = $ctrl; + + # add the op name and full instruction text + push @instructs, $inst; + + # add a 4th control instruction for every 3 instructions + push @instructs, $ctrl = {} if ((@instructs & 3) == 0); + } + # match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + # map the label name to the index of the instruction about to be inserted + $labels{$1} = @instructs+0; + } + else + { + die "badly formed line at $lineNum: $line\n"; + } + } + # add the final BRA op and align the number of instructions to a multiple of 8 + push @{$ctrl->{ctrl}}, 0x007ff; + push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' }; + while (@instructs & 7) + { + push @instructs, $ctrl = {} if ((@instructs & 3) == 0); + push @{$ctrl->{ctrl}}, 0x007e0; + push @instructs, { op => 'NOP', inst => 'NOP;' }; + } + + # remap labels + foreach my $i (@branches) + { + if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1}) + { die "instruction has invalid label: $instructs[$i]{inst}"; } + + $instructs[$i]{jump} = $labels{$1}; + + if (exists $relOffset{$instructs[$i]{op}}) + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; } + else + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; } + } + + # calculate optimal register reuse + # This effects register bank decisions so do it before analyzing register use + foreach my $i (0 .. 
$#instructs) + { + #skip control instructions + next unless $i & 3; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + # Apply the rule pattern + my $capData = parseInstruct($inst, $gram) or next; + + if ($doReuse) + { + # get any vector registers for r0 + my @r0 = getVecRegisters($vectors, $capData); + + # There are 2 reuse slots per register slot + # The reuse hash points to most recent instruction index where register was last used in this slot + + # For writes to a register, clear any reuse opportunity + if (@r0 && !exists $noDest{$op}) + { + foreach my $slot (keys %reuseSlots) + { + if (my $reuse = $reuse{$slot}) + { + # if writing with a vector op, clear all linked registers + delete $reuse->{$_} foreach @r0; + } + } + } + # clear cache if jumping elsewhere + %reuse = () if exists $jumpOp{$op}; + + # only track register reuse for instruction types this works with + if ($gram->{type}{reuse}) + { + foreach my $slot (keys %reuseSlots) + { + next unless exists $capData->{$slot}; + + my $r = $capData->{$slot}; + next if $r eq 'RZ'; + next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction + + my $reuse = $reuse{$slot} ||= {}; + + # if this register was previously marked for potential reuse + if (my $p = $reuse->{$r}) + { + # flag the previous instruction's ctrl reuse array slot + $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot}; + + #print "reuse $slot $r $instructs[$p]{inst}\n"; + } + # list full, delete the oldest + elsif (keys %$reuse > 2) + { + my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0]; + delete $reuse->{$oldest}; + } + # mark the new instruction for potential reuse + $reuse->{$r} = $i; + } + } + } + # if reuse is disabled then pull value from code. 
+ elsif ($gram->{type}{reuse}) + { + $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData); + } + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + # Assign registers to requested banks if possible + foreach my $r (sort keys %$regBank) + { + my $bank = $regBank->{$r}; + my $avail = $regMap->{$r}; + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 3)) + { + # assign it, while removing the assigned register from the pool + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + last; + } + } + } + + # calculate register live times and preferred banks for non-fixed registers. + # LiveTime only half implemented... + my (%liveTime, %pairedBanks, %reuseHistory); + foreach my $i (0 .. $#instructs) + { + #skip control instructions + next unless $i & 3; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + # Apply the rule pattern + my $capData = parseInstruct($inst, $gram) or next; + my $reuseType = $gram->{type}{reuse}; + + # liveTimes and bank conflicts with source operands + my (%addReuse, %delReuse); + foreach my $slot (qw(r8 r20 r39)) + { + my $r = $capData->{$slot} or next; + next if $r eq 'RZ'; + + my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r}; + + # All registers should be written prior to being read.. + if (my $liveTime = $liveTime{$liveR}) + { + # for each read set the current instruction index as the high value + $liveTime->[$#$liveTime][1] = $i; + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + + # Is this register active in the reuse cache? + my $slotHist = $reuseHistory{$slot} ||= {}; + my $selfReuse = $reuseType ? 
exists $slotHist->{$r} : 0; + + #print "IADD3-1: $slot:$r (!$selfReuse && $regMap->{$r})\n" if $op eq 'IADD3'; + + # If this is an auto reg, look at the open banks. + # No need to look at banks if this register is in the reuse cache. + if (!$selfReuse && ref $regMap->{$r}) + { + # Look at other source operands in this instruction and flag what banks are being used + foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39)) + { + my $r2 = $capData->{$slot2}; + next if $r2 eq 'RZ' || $r2 eq $r; + + my $slotHist2 = $reuseHistory{$slot2} ||= {}; + + #print "IADD3-2: $slot:$r $slot2:$r2 (!$reuseType && !$slotHist2->{$r2})\n" if $op eq 'IADD3'; + + # Dont be concerned with non-reuse type instructions or + # If this operand is in the reuse cache, we don't care what bank it's on. + if (!$reuseType || !exists $slotHist2->{$r2}) + { + # if the operand is also an auto-allocated register then link them + # Once we choose the bank for one we want to update that choice for the other register. + if (ref $regMap->{$r2}) + { + push @{$pairedBanks{$r}{pairs}}, $r2; + $pairedBanks{$r}{banks} ||= []; + } + # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid. + else + { + my $bank = substr($regMap->{$r2},1) & 3; + #print "IADD3-3: $r2:$bank\n" if $op eq 'IADD3'; + + $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++; + $pairedBanks{$r}{pairs} ||= []; + } + # Update the total use count for this register. + # This will be the number of times the register is pulled out of the bank. + $pairedBanks{$r}{useCnt}++; + } + } + } + # update the reuse history so we know which bank conflicts we can ignore. + if ($reuseType) + { + # flag these slots for addition or removal from reuseHistory + if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot}) + { $addReuse{$slot} = $r; } + else + { $delReuse{$slot} = $r; } + } + } + # update reuse history after we're done with the instruction (when the flag is actually in effect). 
+ # we don't want to updated it in the middle since that can interfere with the checks, + $reuseHistory{$_}{$addReuse{$_}} = 1 foreach keys %addReuse; + delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse; + + # liveTimes for destination operands and vector registers + foreach my $r0 (getVecRegisters($vectors, $capData)) + { + # fixed register mappings can have aliases so use the actual register value for those. + my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0}; + + # If not writing treat just like a read + if (exists $noDest{$op}) + { + if (my $liveTime = $liveTime{$liveR}) + { + $liveTime->[$#$liveTime][1] = $i; + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r0): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + } + # If writing, push a new bracket on this register's stack. + elsif (my $liveTime = $liveTime{$liveR}) + { + if ($i > $liveTime->[$#$liveTime][1]) + { + push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + else + { + # Initialize the liveTime stack for this register. 
+ push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + #print Dumper(\%liveTime); exit(1); + + # assign unassigned registers + # sort by most restricted, then most used, then name + foreach my $r (sort { + $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} || + $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} || + $a cmp $b + } keys %pairedBanks) + { + my $banks = $pairedBanks{$r}{banks}; + my $avail = $regMap->{$r}; + + #printf "%10s: (%d,%d) %d,%d,%d,%d, %s\n", $r, $pairedBanks{$r}{bnkCnt}, $pairedBanks{$r}{useCnt}, @{$banks}[0,1,2,3], join ',', @$avail; + + # Pick a bank with zero or the smallest number of conflicts + BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3)) + { + # pick an available register that matches the requested bank + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 3)) + { + # assign it, while removing the assigned register from the pool + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + + # update bank info for any unassigned pair + $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}}; + last BANK; + } + } + } + } + # Now assign any remaining to first available + foreach my $r (sort keys %$regMap) + { + if (ref($regMap->{$r}) eq 'ARRAY') + { + $regMap->{$r} = 'R' . shift @{$regMap->{$r}}; + } + } + #print map "$regMap->{$_}: $_\n", sort { substr($regMap->{$a},1) <=> substr($regMap->{$b},1) } keys %$regMap; + + # apply the register mapping and assemble the instructions to op codes + foreach my $i (0 .. $#instructs) + { + #skip control instructions + next unless $i & 3; + + # save the original and replace the register names with numbers + $instructs[$i]{orig} = $instructs[$i]{inst}; + $instructs[$i]{inst} =~ s/(?{$1}) ? 
$regMap->{$1} : $1 /ge;

        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};

        my $match = 0;
        foreach my $gram (@{$grammar{$op}})
        {
            # Apply the rule pattern
            my $capData = parseInstruct($inst, $gram) or next;

            # update the register count
            foreach my $r (qw(r0 r8 r20 r39))
            {
                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';

                # get numeric portion of regname
                my $val = substr $capData->{$r}, 1;

                my @r0 = getVecRegisters($vectors, $capData);
                my @r8 = getAddrVecRegisters($vectors, $capData);

                # smart enough to count vector registers for memory instructions.
                # r0 spans as many registers as its vector, r8 as many as its
                # address vector, every other operand exactly one.  This has to
                # be a single declaration: a second `my $regInc` would shadow
                # the first and throw away the r0 width.
                my $regInc = $r eq 'r0' ? (scalar(@r0) || 1)
                           : $r eq 'r8' ? (scalar(@r8) || 1)
                           :              1;

                if ($val + $regInc > $regCnt)
                {
                    $regCnt = $val + $regInc;
                    #print "$val $regCnt $regInc\n";
                }
            }
            # update the barrier resource count
            if ($op eq 'BAR')
            {
                if (exists $capData->{i8w4})
                {
                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
                }
                # if a barrier value is a register, assume the maximum
                elsif (exists $capData->{r8})
                {
                    $barCnt = 16;
                }
            }
            # Generate the op code.
            my ($code, $reuse) = genCode($op, $gram, $capData);
            $instructs[$i]{code} = $code;

            # cache this for final pass when we want to calculate reuse stats.
            if ($gram->{type}{reuse})
                { $instructs[$i]{caps} = $capData; }
            # use the parsed value of reuse for non-reuse type instructions
            else
                { $ctrl->{reuse}[($i & 3) - 1] = $reuse; }


            $match = 1;
            last;
        }
        unless ($match)
        {
            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
            die "Unable to encode instruction: $inst\n";
        }
    }

    # final pass to piece together control codes
    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
    foreach my $i (0 .. 
$#instructs) + { + # op code + if ($i & 3) + { + push @codes, $instructs[$i]{code}; + + if ($instructs[$i]{caps}) + { + # calculate stats on registers + registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn); + } + if ($instructs[$i]{inst} =~ m'EXIT') + { + push @exitOffsets, (scalar(@codes)-1)*8; + } + elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)') + { + push @ctaidOffsets, (scalar(@codes)-1)*8; + $ctaidzUsed = 1 if $1 eq 'Z'; + } + } + # control code + else + { + my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)}; + push @codes, + ($ctrl->[0] << 0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes + ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59); # reuse codes + } + } + + # return the kernel data + return { + RegCnt => $regCnt, + BarCnt => $barCnt, + ExitOffsets => \@exitOffsets, + CTAIDOffsets => \@ctaidOffsets, + CTAIDZUsed => $ctaidzUsed, + ConflictCnt => $reuseHistory{conflicts}, + ReuseCnt => $reuseHistory{reuse}, + ReuseTot => $reuseHistory{total}, + ReusePct => ($reuseHistory{total} ? 
100 * $reuseHistory{reuse} / $reuseHistory{total} : 0), + KernelData => \@codes, + }; +}

# Useful for testing op code coverage of existing code, extracting new codes and flags
#
# Reads a listing from $fh.  Each line accepted by processSassCtrlLine is
# followed by one instruction line per reuse value it produced.  Every
# instruction is re-encoded with genCode() using the first grammar rule that
# parses it, and the generated op code / reuse flags are compared with the
# values found in the listing.
#
#   $fh             - readable file handle on the listing
#   $printConflicts - when true, the instruction text is handed to
#                     registerHealth(); otherwise an empty string is
#   $all            - when true, passing instructions are reported as well
#
# Prints a per-op report and the totals to STDOUT.
# Returns the number of instructions that failed.
sub Test
{
    my ($fh, $printConflicts, $all) = @_;

    my @instructs;
    my %reuseHistory;
    my ($pass, $fail) = (0,0);

    while (my $line = <$fh>)
    {
        my (@ctrl, @reuse);

        next unless processSassCtrlLine($line, \@ctrl, \@reuse);

        foreach my $fileReuse (@reuse)
        {
            $line = <$fh>;

            my $inst = processSassLine($line) or next;

            $inst->{reuse} = $fileReuse;
            my $fileCode = $inst->{code};

            if (exists $relOffset{$inst->{op}})
            {
                # these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
                $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e;
            }

            my $match = 0;
            foreach my $gram (@{$grammar{$inst->{op}}})
            {
                my $capData = parseInstruct($inst->{inst}, $gram) or next;
                my @caps;

                # Run in test mode to list what capture groups were captured
                my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps);

                # Detect register bank conflicts but only for reuse type instructions.
                # If a bank conflict is avoided by a reuse flag then ignore it.
                registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse};

                $inst->{caps} = join ', ', sort @caps;
                $inst->{codeDiff} = $fileCode ^ $code;
                $inst->{reuseDiff} = $fileReuse ^ $reuse;

                # compare calculated and file values
                if ($code == $fileCode && $reuse == $fileReuse)
                {
                    $inst->{grade} = 'PASS';
                    push @instructs, $inst if $all;
                    $pass++;
                }
                else
                {
                    $inst->{grade} = 'FAIL';
                    push @instructs, $inst;
                    $fail++;
                }
                $match = 1;
                last;
            }
            # no grammar rule parsed the instruction: report the file values as the diff
            unless ($match)
            {
                $inst->{grade} = 'FAIL';
                $inst->{codeDiff} = $fileCode;
                $inst->{reuseDiff} = $fileReuse;
                push @instructs, $inst;
                $fail++;
            }
        }
    }

    # widest instruction text per op, used as the column width of that op's report
    my %maxLen;
    foreach my $inst (@instructs)
    {
        # the first instruction of an op has no entry yet; compare against 0
        $maxLen{$inst->{op}} = length($inst->{ins}) if length($inst->{ins}) > ($maxLen{$inst->{op}} || 0);
    }

    my ($lastOp, $template);
    foreach my $inst (sort {
        $a->{op} cmp $b->{op} ||
        $a->{codeDiff} <=> $b->{codeDiff} ||
        $a->{reuseDiff} <=> $b->{reuseDiff} ||
        $a->{ins} cmp $b->{ins}
        } @instructs)
    {
        # start a new section (header + row template) whenever the op changes;
        # the defined check makes the very first row always open a section
        if (!defined($lastOp) || $lastOp ne $inst->{op})
        {
            $lastOp = $inst->{op};
            $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s %s\n";
            printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures);
        }
        printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)};
    }
    my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0;

    # a literal percent sign in a printf format has to be written as %%
    printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n",
        $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total};

    return $fail;
}

# Convert cuobjdump sass to the working format +sub Extract +{ + my ($in, $out, $params) = @_; + + my %paramMap; + my %constants = + ( + blockDimX => 'c[0x0][0x8]', + blockDimY => 'c[0x0][0xc]', + blockDimZ => 'c[0x0][0x10]', + gridDimX => 'c[0x0][0x14]', + gridDimY => 'c[0x0][0x18]', + gridDimZ => 'c[0x0][0x1c]', + ); + print $out "\n"; + + foreach my $const (sort keys %constants) + { + print $out " $const : $constants{$const}\n"; + $paramMap{$constants{$const}} = $const; + } + print $out "\n"; + + foreach my $p (@$params) + { + my ($ord,$offset,$size,$align) = split ':', $p; + + if ($size > 4) + { + my $num = 0; + $offset = hex $offset; + while ($size > 0) + { + my $param = sprintf 'param_%d[%d]', $ord, $num; + my $const = sprintf 'c[0x0][0x%x]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + $size -= 4; + $offset += 4; + $num += 1; + } + } + else + { + my $param = sprintf 'param_%d', $ord; + my $const = sprintf 'c[0x0][%s]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + } + } + print $out "\n\n"; + + my %labels; + my $labelnum = 1; + + my @data; + FILE: while (my $line = <$in>) + { + my (@ctrl, @ruse); + next unless processSassCtrlLine($line, \@ctrl, \@ruse); + + CTRL: foreach my $ctrl (@ctrl) + { + $line = <$in>; + + my $inst = processSassLine($line) or next CTRL; + + # Convert branch/jump/call addresses to labels + if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)') + { + my $target = hex($1); + + # skip the final BRA and stop processing the file + last FILE if $inst->{op} eq 'BRA' && ($target == $inst->{num} || $target == $inst->{num}-8); + + # check to see if we've already generated a label for this target
address + my $label = $labels{$target}; + unless ($label) + { + # generate a label name and cache it + $label = $labels{$target} = "TARGET$labelnum"; + $labelnum++; + } + # replace address with name + $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/; + } + $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg; + + $inst->{ctrl} = printCtrl($ctrl); + + push @data, $inst; + } + } + # make a second pass now that we have the complete instruction address to label mapping + foreach my $inst (@data) + { + print $out "$labels{$inst->{num}}:\n" if exists $labels{$inst->{num}}; + printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)}; + } +}

# Markup recognised by the preprocessor.  Every pattern matches one complete
# tagged section together with its leading indentation and trailing newline.
#   $IncludeRe  captures the file name                       ($1)
#   $CodeRe     captures the tag suffix ($1) and the code    ($2)
#   $ConstMapRe / $RegMapRe / $ScheduleRe capture the body   ($1)
#   $InlineRe   captures the +/- marker ($1) and the code    ($2)
# NOTE(review): the tag names are the upstream MaxAs markup names; only the
# closing CODE tag could be cross-checked locally - confirm the others.
my $CommentRe  = qr'^[\t ]*<COMMENT>.*?^\s*</COMMENT>\n?'ms;
my $IncludeRe  = qr'^[\t ]*<INCLUDE\s+file="([^"]+)"\s*/?>\n?'ms;
my $CodeRe     = qr'^[\t ]*<CODE(\d*)>(.*?)^\s*<\/CODE\1>\n?'ms;
my $ConstMapRe = qr'^[\t ]*<CONSTANT_MAPPING>(.*?)^\s*</CONSTANT_MAPPING>\n?'ms;
my $RegMapRe   = qr'^[\t ]*<REGISTER_MAPPING>(.*?)^\s*</REGISTER_MAPPING>\n?'ms;
my $ScheduleRe = qr'^[\t ]*<SCHEDULE_BLOCK>(.*?)^\s*</SCHEDULE_BLOCK>\n?'ms;
my $InlineRe   = qr'\[(\+|\-)(.+?)\1\]'ms;

# Return the whole content of an included file.
#
#   $file    - path as written in the include directive; tried first as given
#   $include - array ref spliced into File::Spec->catpath in front of the bare
#              file name for the second attempt (presumably [volume, directory]
#              of the including file - confirm against the callers)
#
# Dies when neither location can be opened.
sub IncludeFile
{
    my ($file, $include) = @_;
    my ($vol,$dir,$name) = File::Spec->splitpath($file);

    # Three-argument open: the name comes from the source being assembled, so
    # it must never be interpreted as an open mode or a pipe.
    my $fh;
    if (!open $fh, '<', $file)
    {
        open $fh, '<', File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n";
    }

    # slurp the file in one read
    local $/;
    my $content = <$fh>;
    close $fh;
    return $content;
}

# Run the preprocessing pipeline over a source text and return the result.
#
#   $file      - source text
#   $include   - include location, handed to IncludeFile
#   $debug     - handed to Scheduler
#   $regMap    - optional hash ref filled by setRegisterMap; when one is
#                supplied the register mapping section is removed from the
#                returned text, otherwise a private map is used and the
#                section is kept
#   $doAnalyze - when false, analyze blocks are stripped from the result
#
# Steps, in order: expand includes (repeatedly, so includes may nest), strip
# comment sections, evaluate CODE sections and inline code, apply the constant
# map, read the register map, schedule every schedule block.
#
# NOTE(review): CODE sections and inline code are run through string eval, so
# a source file can execute arbitrary Perl - only assemble trusted input.
sub Preprocess
{
    my ($file, $include, $debug, $regMap, $doAnalyze) = @_;

    my $constMap = {};
    my $removeRegMap;
    if ($regMap)
        { $removeRegMap = 1; }
    else
        { $regMap = {}; }

    # include nested files
    1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg;

    # Strip out comments
    $file =~ s|$CommentRe||g;

    # Execute the CODE sections (old way to run code, to be deprecated)
    # the section is replaced by whatever its code evaluates to
    1 while $file =~ s|$CodeRe|
        my $code = $2;
        my $out = eval "package MaxAs::MaxAs::CODE; $code";
        $@ ? die("CODE:\n$code\n\nError: $@\n") : $out |eg;

    # Execute the inline code (new way)
    # [+ ... +] is replaced by the value of the code, [- ... -] by nothing
    $file =~ s|$InlineRe|
        my ($type, $code) = ($1, $2);
        my $out = eval "package MaxAs::MaxAs::CODE; $code";
        $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg;

    #Pull in the constMap
    # setConstMap returns nothing, so the section itself disappears
    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;

    my @newFile;
    foreach my $line (split "\n", $file)
    {
        # skip comments
        if ($line !~ m'^\s*(?:#|//).*')
        {
            # swap every word (optionally with a [n] index) that names a constant
            $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg;
        }
        push @newFile, $line;
    }
    $file = join "\n", @newFile;

    # Pull in the reg map first as the Scheduler will need it to handle vector instructions
    # Remove the regmap if we're going on to assemble
    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg;

    # Pick out the SCHEDULE_BLOCK sections
    my @schedBlocks = $file =~ /$ScheduleRe/g;

    # Schedule them
    foreach my $i (0 .. $#schedBlocks)
    {
        # XMAD macros should only appear in SCHEDULE_BLOCKs
        $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]);

        $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug);
    }

    # Replace the results
    # blocks are consumed in the same order in which they were extracted
    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;

    # Strip out analyzeBlocks
    $file =~ s|$AnalyzeRe||eg if not $doAnalyze;

    return $file;
}

sub Scheduler +{ + my ($block, $blockNum, $regMap, $debug) = @_; + + my $vectors = $regMap->{__vectors}; + my $lineNum = 0; + + my (@instructs, @comments, $ordered, $first); + foreach my $line (split "\n", $block) + { + # keep track of line nums in the physical file + $lineNum++; + + unless (preProcessLine($line)) + { + push @comments, $line if $line =~ m'\S'; + next; + } + + # match an instruction + if (my $inst = processAsmLine($line, $lineNum)) + { + # if the first instruction in the block is waiting on a dep, it should go first. + $inst->{first} = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1; + + # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block) + #$inst->{first} = $inst->{ctrl} & 0x0000f ? 
1 : 2; + $inst->{exeTime} = 0; + $inst->{order} = $ordered++ if $ordered; + $inst->{force_stall} = $inst->{ctrl} & 0xf if $inst->{comment} =~ m'FORCE'; + + push @instructs, $inst; + } + # match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n"; + } + # open an ORDERED block + elsif ($line =~ m'^') + { + die "you cannot use nested tags" if $ordered; + $ordered = 1; + } + # close an ORDERED block + elsif ($line =~ m'^') + { + die "missing opening for closing tag" if !$ordered; + $ordered = 0; + } + else + { + die "badly formed line at block: $blockNum line: $lineNum: $line\n"; + } + } + my (%writes, %reads, @ready, @schedule, $orderedParent); + # assemble the instructions to op codes + foreach my $instruct (@instructs) + { + my $match = 0; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $capData = parseInstruct($instruct->{inst}, $gram) or next; + my (@dest, @src); + + # copy over instruction types for easier access + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + + $instruct->{dualCnt} = $instruct->{dual} ? 1 : 0; + + # A predicate prefix is treated as a source reg + push @src, $instruct->{predReg} if $instruct->{pred}; + + # Handle P2R and R2P specially + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + my $list = $instruct->{op} eq 'R2P' ? 
\@dest : \@src; + my $mask = hex($capData->{i20w7}); + foreach my $p (0..6) + { + if ($mask & (1 << $p)) + { + push @$list, "P$p"; + } + # make this instruction dependent on any predicates it's not setting + # this is to prevent a race condition for any predicate sets that are pending + elsif ($instruct->{op} eq 'R2P') + { + push @src, "P$p"; + } + } + # These instructions can't be dual issued + $instruct->{nodual} = 1; + } + + # Populate our register source and destination lists, skipping any zero or true values + foreach my $operand (grep { exists $regops{$_} } sort keys %$capData) + { + # figure out which list to populate + my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src; + + # Filter out RZ and PT + my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT'; + + if ($capData->{$operand} ne $badVal) + { + # add the value to list with the correct prefix + push @$list, + $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) : + $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) : + $operand eq 'CC' ? 'CC' : + $operand eq 'X' ? 'CC' : + getRegNum($regMap, $capData->{$operand}); + } + } + $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39}); + + # Find Read-After-Write dependencies + foreach my $src (grep { exists $writes{$_} } @src) + { + # Memory operations get delayed access to registers but not to the predicate + my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat}; + + # the parent should be the most recently added dest op to the stack + foreach my $parent (@{$writes{$src}}) + { + # add this instruction as a child of the parent + # set the edge to the total latency of reg source availability + #print "R $parent->{inst}\n\t\t$instruct->{inst}\n"; + my $latency = $src =~ m'^P\d' ? 
13 : $parent->{lat}; + push @{$parent->{children}}, [$instruct, $latency - $regLatency]; + $instruct->{parents}++; + + # if the destination was conditionally executed, we also need to keep going back till it wasn't + last unless $parent->{pred}; + } + } + + # Find Write-After-Read dependencies + foreach my $dest (grep { exists $reads{$_} } @dest) + { + # Flag this instruction as dependent to any previous read + foreach my $reader (@{$reads{$dest}}) + { + # no need to stall for these types of dependencies + #print "W $reader->{inst} \t\t\t $instruct->{inst}\n"; + push @{$reader->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + # Once dependence is marked we can clear out the read list (unless this write was conditional). + # The assumption here is that you would never want to write out a register without + # subsequently reading it in some way prior to writing it again. + delete $reads{$dest} unless $instruct->{pred}; + } + + # Enforce instruction ordering where requested + if ($instruct->{order}) + { + if ($orderedParent && $instruct->{order} > $orderedParent->{order}) + { + push @{$orderedParent->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + $orderedParent = $instruct; + } + elsif ($orderedParent) + { $orderedParent = 0; } + + # For a dest reg, push it onto the write stack + unshift @{$writes{$_}}, $instruct foreach @dest; + + # For a src reg, push it into the read list + push @{$reads{$_}}, $instruct foreach @src; + + # if this instruction has no dependencies it's ready to go + push @ready, $instruct if !exists $instruct->{parents}; + + $match = 1; + last; + } + die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match; + } + %writes = (); + %reads = (); + + if (@ready) + { + # update dependent counts for sorting hueristic + my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" }; + + countUniqueDescendants($readyParent, {}); + updateDepCounts($readyParent, {}); + 
+ # sort the initial ready list + @ready = sort { + $a->{first} <=> $b->{first} || + $b->{deps} <=> $a->{deps} || + $a->{dualCnt} <=> $b->{dualCnt} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall dualCnt mix deps lineNum inst)} foreach @ready; + } + } + + # Process the ready list, adding new instructions to the list as we go. + my $clock = 0; + while (my $instruct = shift @ready) + { + my $stall = $instruct->{stall}; + + # apply the stall to the previous instruction + if (@schedule && $stall < 16) + { + my $prev = $schedule[$#schedule]; + + $stall = $prev->{force_stall} if $prev->{force_stall} > $stall; + + # if stall is greater than 4 then also yield + # the yield flag is required to get stall counts 12-15 working correctly. + $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0; + $prev->{ctrl} |= $stall; + $clock += $stall; + } + # For stalls bigger than 15 we assume the user is managing it with a barrier + else + { + $instruct->{ctrl} &= 0x1fff0; + $instruct->{ctrl} |= 1; + $clock += 1; + } + print "$clock: $instruct->{inst}\n" if $debug; + + # add a new instruction to the schedule + push @schedule, $instruct; + + # update each child with a new earliest execution time + if (my $children = $instruct->{children}) + { + foreach (@$children) + { + my ($child, $latency) = @$_; + + # update the earliest clock value this child can safely execute + my $earliest = $clock + $latency; + $child->{exeTime} = $earliest if $child->{exeTime} < $earliest; + + print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug; + + # decrement parent count and add to ready queue if none remaining. 
+ push @ready, $child if --$child->{parents} < 1; + } + delete $instruct->{children}; + } + + # update stall and mix values in the ready queue on each iteration + foreach my $ready (@ready) + { + # calculate how many instructions this would cause the just added instruction to stall. + $stall = $ready->{exeTime} - $clock; + $stall = 1 if $stall < 1; + + # if using the same compute resource as the prior instruction then limit the throughput + if ($ready->{class} eq $instruct->{class}) + { + $stall = $ready->{tput} if $stall < $ready->{tput}; + } + # dual issue with a simple instruction (tput <= 2) + # can't dual issue two instructions that both load a constant + elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} && + $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const})) + { + $stall = 0; + } + $ready->{stall} = $stall; + + # add an instruction class mixing huristic that catches anything not handled by the stall + $ready->{mix} = $ready->{class} ne $instruct->{class} || 0; + $ready->{mix} = 2 if $ready->{mix} && $ready->{op} eq 'R2P'; + } + + # sort the ready list by stall time, mixing huristic, dependencies and line number + @ready = sort { + $a->{first} <=> $b->{first} || + $a->{stall} <=> $b->{stall} || + $a->{dualCnt} <=> $b->{dualCnt} || + $b->{mix} <=> $a->{mix} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "\tf,ext,stl,duc,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(f exeTime stall dualCnt mix deps lineNum inst)} foreach @ready; + } + + foreach my $ready (@ready) + { + $ready->{dualCnt} = 0 if $ready->{dualCnt} && $ready->{stall} == 1; + } + } + + my $out; + #$out .= "$_\n" foreach @comments; + $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule; + return $out; +}

# Fill $constMap from the text of a constant mapping section.
# Each non-blank line has the form "name : value"; anything from a '#' or '//'
# to the end of the line is a comment.  Only the first two ':'-separated
# fields of a line are used.
#
#   $constMap     - hash ref that receives name => value
#   $constMapText - body of the mapping section
#
# Returns nothing.
sub setConstMap
{
    my ($constMap, $constMapText) = @_;

    for my $entry (split "\n", $constMapText)
    {
        # drop the comment, then trim both ends
        (my $text = $entry) =~ s{(?:#|//).*}{};
        $text =~ s|^\s+||;
        $text =~ s|\s+$||;

        # nothing left on this line
        next if $text !~ m'\S';

        my ($name, $value) = split '\s*:\s*', $text;
        $constMap->{$name} = $value;
    }
    return;
}

sub setRegisterMap +{ + my ($regMap, $regmapText) = @_; + + my $vectors = $regMap->{__vectors} ||= {}; + my $regBank = $regMap->{__regbank} ||= {}; + my %aliases; + + foreach my $line (split "\n", $regmapText) + { + # strip leading space + $line =~ s|^\s+||; + # strip comments + $line =~ s{(?:#|//).*}{}; + # strip trailing space + $line =~ s|\s+$||; + # skip blank lines + next unless $line =~ m'\S'; + + my $auto = $line =~ /~/; + my $share = $line =~ /=/; + + my ($regNums, $regNames) = split '\s*[:~=]\s*', $line; + + my (@numList, @nameList, %vecAliases); + foreach my $num (split '\s*,\s*', $regNums) + { + my ($start, $stop) = split '\s*\-\s*', $num; + die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop; + push @numList, ($start .. $stop||$start); + } + foreach my $fullName (split '\s*,\s*', $regNames) + { + if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$') + { + my ($name1, $name2, $bank) = ($1, $3, $4); + foreach (split '\s*\|\s*', $2) + { + my ($start, $stop) = split '\s*\-\s*'; + foreach my $r (map "$name1$_$name2", $start .. 
$stop||$start) + { + # define an alias for use in vector instructions that omits the number portion + $aliases{$r} = "$name1$name2" unless exists $aliases{$r}; + push @nameList, $r; + $regBank->{$r} = $bank if $auto && defined $bank; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank; + } + } + } + elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$') + { + push @nameList, $1; + $regBank->{$1} = $2 if $auto && defined $2; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2; + } + else + { + die "Bad register name: '$fullName' at: $line\n"; + } + } + die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList; + die "Missmatched register mapping at: $line\n" if $share && @numList > 1; + + # detect if this list is monotonically ascending with no gaps + my $i = 0; + while ($i < $#numList-1) + { + last if $numList[$i] + 1 != $numList[$i+1]; + $i++; + } + my $ascending = $i+1 == $#numList; + + foreach my $n (0..$#nameList) + { + die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]}; + + if ($auto) + { + # assign possible values to be assigned on assembly + $regMap->{$nameList[$n]} = \@numList; + } + elsif ($share) + { + # each name shares the same single register + $regMap->{$nameList[$n]} = 'R' . $numList[0]; + } + else + { + $regMap->{$nameList[$n]} = 'R' . $numList[$n]; + # flag any even register as a potential vector + if ($ascending && ($numList[$n] & 1) == 0) + { + # constrain potential range to vector alignment + my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3); + if ($end <= $#nameList) + { + $vectors->{$nameList[$n]} = [ @nameList[$n .. 
$end] ]; + #setup an alias for the base name without the number + if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}}) + { + $regMap->{$aliases{$nameList[$n]}} = $regMap->{$nameList[$n]}; + $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]}; + delete $aliases{$nameList[$n]}; + } + } + } + } + } + } + #print Dumper($regMap); exit(1); +} + +sub preProcessLine +{ + # strip leading space + $_[0] =~ s|^\s+||; + + # preserve comment but check for emptiness + my $val = shift; + + # strip comments + $val =~ s{(?:#|//).*}{}; + + # skip blank lines + return $val =~ m'\S'; +} + +# traverse the graph and count total descendants per node. +# only count unique nodes (by lineNum) +sub countUniqueDescendants +{ + my ($node, $edges) = @_; + + #print "P:$node->{inst}\n"; + + if (my $children = $node->{children}) + { + foreach my $child (grep $_->[1], @$children) # skip WaR deps and traversed edges + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + + $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges); + } + foreach my $child (grep !$_->[1], @$children) # WaR deps + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + + 1 foreach countUniqueDescendants($child->[0], $edges); + } + } + else + { + return $node->{lineNum}; + } + return ($node->{lineNum}, keys %{$node->{deps}}); +} +# convert hash to count for easier sorting. +sub updateDepCounts +{ + my ($node, $edges) = @_; + + #warn "$node->{inst}\n"; + + if (my $children = $node->{children}) + { + foreach my $child (@$children) + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + updateDepCounts($child->[0], $edges); + } + } + $node->{deps} = ref $node->{deps} ? 
keys %{$node->{deps}} : $node->{deps}+0; +} + +# Detect register bank conflicts and calculate reuse stats +sub registerHealth +{ + my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_; + + my (@banks, @conflicts); + + foreach my $slot (qw(r8 r20 r39)) + { + my $r = $capData->{$slot} or next; + next if $r eq 'RZ'; + + my $slotHist = $reuseHistory->{$slot} ||= {}; + + $reuseHistory->{total}++; + + # if this register is in active reuse then ignore for bank conflict checking. + if (exists $slotHist->{$r}) + { + $reuseHistory->{reuse}++; + } + else + { + # extract number from reg and take the modulo-4 value. This is the bank id. + my $bank = substr($r,1) & 3; + + # check for conflict + if ($banks[$bank] && $banks[$bank] ne $r) + { + push @conflicts, $banks[$bank] if !@conflicts; + push @conflicts, $r; + + $reuseHistory->{conflicts}++; + } + $banks[$bank] = $r; + } + + # update the history + if ($reuseFlags & $reuseSlots{$slot}) + { $slotHist->{$r} = 1; } + else + { delete $slotHist->{$r}; } + } + if ($inst && @conflicts && !$nowarn) + { + printf "CONFLICT at 0x%04x (%s): $inst\n", $instAddr, join(',', @conflicts); + } + return scalar @conflicts; +} + +1; + +__END__ + +=head1 NAME + +MaxAs::MaxAs - Assembler for NVIDIA Maxwell architecture + +=head1 SYNOPSIS + + maxas.pl [opts] + +=head1 DESCRIPTION + +See the documentation at: https://github.com/NervanaSystems/maxas + +=head1 SEE ALSO + +See the documentation at: https://github.com/NervanaSystems/maxas + + +=head1 AUTHOR + +Scott Gray, Esgray@nervanasys.com + +=head1 COPYRIGHT AND LICENSE + +The MIT License (MIT) + +Copyright (c) 2014 Scott Gray + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons 
# Encode a register operand.
#   $val - register name, "R<number>" with a number below 255, or "RZ"
#   $pos - bit position of the field inside the op code
# Returns the register number (0xff for RZ) shifted left by $pos.
# Dies with "Bad register name found" for anything else.
sub getR
{
    my ($val, $pos) = @_;

    # "Z" compares as 0 in the numeric test, so RZ passes the range check
    die "Bad register name found: $val\n"
        unless $val =~ m'^R(\d+|Z)$' && $1 < 255;

    my $regNum = $1 eq 'Z' ? 0xff : $1;

    return $regNum << $pos;
}

# Encode a predicate operand.
#   $val - predicate name, "P0" .. "P6" or "PT"
#   $pos - bit position of the field inside the op code
# Returns the predicate number (7 for PT) shifted left by $pos.
# Dies with "Bad predicate name found" for anything else.
sub getP
{
    my ($val, $pos) = @_;

    # "T" compares as 0 in the numeric test, so PT passes the range check
    die "Bad predicate name found: $val\n"
        unless $val =~ m'^P(\d|T)$' && $1 < 7;

    my $predNum = $1 eq 'T' ? 7 : $1;

    return $predNum << $pos;
}

# Encode a constant offset given as a hex string: the value is divided by
# four, limited to 15 bits and placed at bit 20.
# (presumably the division turns a byte offset into a 32-bit word index - confirm)
sub getC
{
    my ($offset) = @_;

    my $field = (hex($offset) >> 2) & 0x7fff;

    return $field << 20;
}
sub { getI($_[0], 53, 0x1f) }, +); + +# Rules for operands and their closely tied flags +my $hex = qr"0[xX][0-9a-fA-F]+"; +my $iAddr = qr"\d+[xX]<[^>]+>"; +my $immed = qr"$hex|$iAddr|\d+"o; +my $reg = qr"[a-zA-Z_]\w*"; # must start with letter or underscore\ +my $p = qr"P[0-6T]"; +my $noPred = qr"(?)"; +my $pred = qr"\@(?\!)?P(?[0-6]) "; +my $p0 = qr"(?$p)"o; +my $p3 = qr"(?$p)"o; +my $p12 = qr"(?\!)?(?$p)"o; +my $p29 = qr"(?\!)?(?$p)"o; +my $p39 = qr"(?\!)?(?$p)"o; +my $p45 = qr"(?$p)"o; +my $p48 = qr"(?$p)"o; +my $p58 = qr"(?$p)"o; +my $r0 = qr"(?$reg)"; +my $r0cc = qr"(?$reg)(?\.CC)?"; +my $r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1|F32))?(?\.reuse)?"; +my $r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r28 = qr"(?$reg)"; +my $r39s20 = qr"(?\-)?(?\|)?(?(?$reg))\|?(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39 = qr"(?\-)?(?$reg)(?:\.(?H0|H1|H0_H0|H1_H1|F32))?(?\.reuse)?"; +my $r39a = qr"(?(?$reg))(?\.reuse)?"; +my $c20 = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20x = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20s39 = qr"(?\-)?c\[(?$hex)\]\s*\[(?$hex)\]"o; +my $f20w32 = qr"(?(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))"; +my $f20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $d20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $i8w4 = qr"(?$immed)"o; +my $i20 = qr"(?(?\-)?$immed)(?\.NEG)?"o; +my $i20w6 = qr"(?$immed)"o; +my $i20w7 = qr"(?$immed)"o; +my $i20w8 = qr"(?$immed)"o; +my $i20w12 = qr"(?$immed)"o; +my $i20w24 = qr"(?\-?$immed)"o; +my $i20w32 = qr"(?\-?$immed)"o; +my $i39w8 = qr"(?\-?$immed)"o; +my $i28w8 = qr"(?$immed)"o; +my $i28w20 = qr"(?\-?$immed)"o; +my $i31w4 = qr"(?$immed)"o; +my $i34w13 = qr"(?$immed)"o; +my $i36w20 = qr"(?$immed)"o; +my $i48w8 = qr"(?$immed)"o; +my $i51w5 = qr"(?$immed)"o; +my $i53w5 = qr"(?$immed)"o; +my $ir20 = 
qr"$i20|$r20"o; +my $cr20 = qr"$c20|$r20"o; +my $icr20 = qr"$i20|$c20|$r20"o; +my $fcr20 = qr"$f20|$c20|$r20"o; +my $cr39 = qr"$c20s39|$r39"o; +my $dr20 = qr"$d20|$r20"o; + +# Instruction specific rules for capturing various flags +my $u32 = qr"(?\.U32)?"; +my $ftz = qr"(?\.FTZ)?"; +my $fmz = qr"(?\.FMZ)?"; +my $sat = qr"(?\.SAT)?"; +my $rnd = qr"(?:\.(?RN|RM|RP|RZ))?"; +my $round = qr"(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?"; +my $fcmp = qr"(?\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; +my $icmp = qr"\.(?LT|EQ|LE|GT|NE|GE)"; +my $bool = qr"\.(?AND|OR|XOR|PASS_B)"; +my $bool2 = qr"\.(?AND|OR|XOR)"; +my $func = qr"\.(?COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)"; +my $rro = qr"\.(?SINCOS|EX2)"; +my $add3 = qr"(?:\.(?X|RS|LS))?"; +my $lopz = qr"(?:\.(?NZ|Z) $p48,|(?))"o; +my $X = qr"(?\.X)?"; +my $tld = qr"(?NODEP\.)?(?:(?T)|(?P))"; +my $chnls = qr"(?R|RGBA)"; +my $sr = qr"SR_(?\S+)"; +my $shf = qr"(?\.W)?(?:\.(?U64|S64))?(?\.HI)?"; +my $xmad = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL\.CLO|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $xmadc = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL\.CLO|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $vmad8 = qr"\.(?[SU])(?8|16)\.(?[SU])(?8|16)(?\.PO)?(?\.SHR_7)?(?\.SHR_15)?(?\.SAT)?"; +my $vmad16= qr"\.(?[SU])(?16)\.(?[SU])(?16)"; +my $hilo = qr"(?:\.(?XHI|XLO))?"; +my $vaddType = qr"(?:\.(?UD))?(?:\.(?SD))?(?:\.(?[SU])(?8|16|32))?(?:\.(?[SU])(?8|16|32))?"; +my $vaddMode = qr"(?:\.(?MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?"; +my $vmnmx = qr"(?:\.(?MX))?"; +my $x2x = qr"\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)"; +my $prmt = qr"(?:\.(?F4E|B4E|RC8|ECL|ECR|RC16))?"; +my $shfl = qr"\.(?IDX|UP|DOWN|BFLY)"; +my $bar = qr"\.(?SYNC|ARV|RED)(?:\.(?POPC|AND|OR))? 
(?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?()|(?))(?(), $p39|(?))"o; +my $b2r = qr"\.RESULT $r0(?:, $p45|(?))"o; +my $dbar = qr"(?SB0|SB1|SB2|SB3|SB4|SB5)"; +my $dbar2 = qr"(?5)?,?(?4)?,?(?3)?,?(?2)?,?(?1)?,?(?0)?"; +my $mbar = qr"\.(?CTA|GL|SYS)"; +my $addr = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i20w24)?\]"o; +my $addr2 = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i28w20)?\]"o; +my $ldc = qr"c\[(?$hex)\]\s*$addr"o; +my $atom = qr"(?\.E)?(?:\.(?ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; +my $vote = qr"\.(?ALL|ANY|EQ)"o; +my $memType = qr"(?\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; +my $memCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CI|CS|CV|IL|WT))?"; +my $dptype = qr"(?:\.(?U32|S32))?(?:\.(?U32|S32))?"; +my $dpmode = qr"\.(?LO|HI)"; +my $hmode = qr"(?:\.(?F32|MRG_H0|MRG_H1))?$ftz"; + +# class: hardware resource that shares characteristics with types +# lat : pipeline depth where relevent, placeholder for memory ops +# blat : barrier latency, typical fetch time for memory operations. Highly variable. +# rlat : operand read latency for memory ops +# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op. +# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession. +# dual : whether this instruction type can be dual issued +# reuse: whether this instruction type accepts register reuse flags. + +# Some of these values are guesses and need to be updated from micro benchmarks. +# We may need to split these classes up further. 
+# @TODO(keren): what instructions are used by SFUs +my $s2rT = {class => 's2r', lat => 2, blat => 25, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0, units => 32}; +my $smemT = {class => 'mem', lat => 6, blat => 30, rlat => 2, rhold => 20, tput => 1, dual => 1, reuse => 0, units => 8, type => 'shared'}; +my $gmemT = {class => 'mem', lat => 200, blat => 200, rlat => 4, rhold => 20, tput => 1, dual => 1, reuse => 0, units => 8, type => 'global'}; +my $x32T = {class => 'x32', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 1, units => 32}; +my $x64T = {class => 'x64', lat => 2, blat => 128, rlat => 0, rhold => 0, tput => 128, dual => 0, reuse => 1, units => 16}; +my $shftT = {class => 'shift', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1, units => 32}; +my $cmpT = {class => 'cmp', lat => 13, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1, units => 32}; +my $qtrT = {class => 'qtr', lat => 8, blat => 0, rlat => 4, rhold => 0, tput => 1, dual => 1, reuse => 0, units => 8}; +my $rroT = {class => 'rro', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0, units => 8}; +my $voteT = {class => 'vote', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0, units => 32}; + + +# Create map of op names to rules +our %grammar = +( + #Floating Point Instructions + FADD => [ { type => $x32T, code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FADD32I => [ { type => $x32T, code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o, } ], + FCHK => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o, } ], #Partial? 
+ FCMP => [ { type => $cmpT, code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o, } ], + FFMA => [ + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o, }, + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o, }, + ], + FMNMX => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o, } ], + FMUL => [ { type => $x32T, code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FMUL32I => [ { type => $x32T, code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o, } ], + FSET => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o, } ], + FSETP => [ { type => $cmpT, code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], + MUFU => [ { type => $qtrT, code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o, } ], + RRO => [ { type => $rroT, code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o, } ], + DADD => [ { type => $x64T, code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o, } ], + DFMA => [ { type => $x64T, code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o, } ], + DMNMX => [ { type => $cmpT, code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o, } ], + DMUL => [ { type => $x64T, code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o, } ], + DSET => [ { type => $cmpT, code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o, } ], + DSETP => [ { type => $cmpT, code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o, } ], + FSWZADD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o, } ], #TODO + + HADD2 => [ 
{ type => $x32T, code => 0x5d10000000000000, rule => qr"^$pred?HADD2$hmode$fmz$ftz$sat $r0, $r8, $r20;"o, } ], + HMUL2 => [ { type => $x32T, code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$hmode$fmz$ftz$sat $r0, $r8, $r20;"o, } ], + HFMA2 => [ { type => $x32T, code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$hmode$fmz$ftz$sat $r0, $r8, $r20, $r39;"o, } ], + HSETP2 => [ { type => $cmpT, code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial + + #Integer Instructions + BFE => [ { type => $shftT, code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o, } ], + BFI => [ { type => $shftT, code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o, } ], + FLO => [ { type => $s2rT, code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o, } ], + IADD => [ { type => $x32T, code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o, } ], + IADD32I => [ { type => $x32T, code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o, } ], + IADD3 => [ { type => $x32T, code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o, } ], + ICMP => [ { type => $cmpT, code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o, } ], + IMNMX => [ { type => $shftT, code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o, } ], + ISET => [ { type => $shftT, code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o, } ], + ISETP => [ { type => $cmpT, code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ], + ISCADD => [ { type => $shftT, code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o, } ], + ISCADD32I => [ { type => $shftT, code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o, } ], + LEA => [ + { type => $cmpT, code 
=> 0x5bd0000000000000, rule => qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o, }, + { type => $shftT, code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o, }, + { type => $shftT, code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o, }, + { type => $shftT, code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o, }, + ], + LOP => [ { type => $x32T, code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?~)?$icr20(?\.INV)?;"o, } ], + LOP32I => [ { type => $x32T, code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o, } ], + LOP3 => [ + { type => $x32T, code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o, }, + { type => $x32T, code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o, }, + ], + POPC => [ { type => $s2rT, code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o, } ], + SHF => [ + { type => $shftT, code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o, }, + { type => $shftT, code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o, }, + ], + SHL => [ { type => $shftT, code => 0x5c48000000000000, rule => qr"^$pred?SHL(?\.W)? 
$r0, $r8, $icr20;"o, } ], + SHR => [ { type => $shftT, code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o, } ], + XMAD => [ + { type => $x32T, code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o, }, + { type => $x32T, code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o, }, + ], + # XMAD replaces these + IMAD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o, } ], #TODO + IMADSP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO + IMUL => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o, } ], #TODO + + #Conversion Instructions + F2F => [ { type => $qtrT, code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ], + F2I => [ { type => $qtrT, code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o, } ], + I2F => [ { type => $qtrT, code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o, } ], + I2I => [ { type => $qtrT, code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o, } ], + + #Movement Instructions + MOV => [ { type => $x32T, code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o, } ], + MOV32I => [ { type => $x32T, code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o, } ], + PRMT => [ { type => $x32T, code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ], + SEL => [ { type => $x32T, code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o, } ], + SHFL => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ], + + #Predicate/CC Instructions + PSET => [ { type => $cmpT, code => 0x5088000000000000, rule => 
qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o, } ], + PSETP => [ { type => $cmpT, code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ], + CSET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o, } ], #TODO + CSETP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO + P2R => [ { type => $x32T, code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o, } ], + R2P => [ { type => $shftT, code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o, } ], + + #Texture Instructions + # Handle the commonly used 1D texture functions.. but save the others for later + TLD => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial + TLDS => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial + TEX => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o, } ], #TODO + TLD4 => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o, } ], #TODO + TXQ => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o, } ], #TODO + TEXS => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o, } ], #TODO + TLD4S => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO + + #Compute Load/Store Instructions + LD => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o, } ], + ST => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o, } ], + LDG => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o, } ], + STG => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, 
$r0;"o, } ], + LDS => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o, } ], + STS => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o, } ], + LDL => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o, } ], + STL => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o, } ], + LDC => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o, } ], + # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded). + ATOM => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + ATOMS => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + RED => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o, } ], + CCTL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o, } ], #TODO + CCTLL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO + CCTLT => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO + + #Surface Memory Instructions (haven't gotten to these yet..) + SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO + SULD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o, } ], #TODO + SURED => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o, } ], #TODO + SUST => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o, } ], #TODO + + #Control Instructions + BRA => [ + { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?\.U)? 
$i20w24;"o, }, + { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?\.U)? CC\.EQ, $i20w24;"o, }, + ], + BRX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o, } ], #TODO + JMP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o, } ], #TODO + JMX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o, } ], #TODO + SSY => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o, } ], + SYNC => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o, } ], + CAL => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o, } ], + JCAL => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o, } ], + PRET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o, } ], #TODO + RET => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o, } ], + BRK => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o, } ], + PBK => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o, } ], + CONT => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o, } ], + PCNT => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o, } ], + EXIT => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o, } ], + PEXIT => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o, } ], #TODO + BPT => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o, } ], + + #Miscellaneous Instructions + NOP => [ { type => $x32T, code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o, } ], + CS2R => [ { type => $x32T, code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o, } ], + S2R => [ { type => $s2rT, code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o, } ], + B2R => [ { type => $x32T, code => 
0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o, } ], + BAR => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o, } ], + DEPBAR => [ + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, }, + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o, }, + ], + MEMBAR => [ { type => $x32T, code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o, } ], + VOTE => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?))$p45, $p39;"o, } ], + R2B => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o, } ], #TODO + + #Video Instructions... Need to finish + VADD => [ { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMAD => [ + { type => $x32T, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, }, + ], + VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMNMX => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + + VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + + DP4A => [ { type => $x32T, code => 0x53f8000000000000, rule => qr"^$pred?DP4A$dptype $r0, $r8, $icr20, $r39;"o, } ], + DP2A => [ { type => $x32T, code => 0x53f9000000000000, rule => qr"^$pred?DP2A$dpmode$dptype $r0, $r8, $icr20, $r39;"o, } ], +); + +# Create map of capture groups to op code flags that need to be added (or removed) +my @flags = grep /\S/, split "\n", q{; + +BFE, BFI, FLO, IADD, 
IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD +0x0100000000000000 neg + +FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP +0x0100000000000000 neg + +PSET, PSETP +0x0000000000008000 p12not +0x0000000100000000 p29not + +FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE +0x0000040000000000 p39not + +IADD, IADD3, XMAD, LEA, IMNMX +0x0000800000000000 CC + +IADD32I +0x0010000000000000 CC + +LEA +0x0000000000000000 X + +SHF +0x0004000000000000 W +0x0001000000000000 HI + +SHF: type +0x0000004000000000 U64 +0x0000006000000000 S64 + +SHR, IMNMX, ISETP, ISET, ICMP, BFE +0x0001000000000000 U32 + +SHL +0x0000008000000000 W + +SHFL +0x0000000010000000 i20w8 +0x0000000020000000 i34w13 + +SHFL: mode +0x0000000000000000 IDX +0x0000000040000000 UP +0x0000000080000000 DOWN +0x00000000c0000000 BFLY + +IMNMX: mode +0x0000080000000000 XLO +0x0000180000000000 XHI + +ISETP, ISET, ICMP: cmp +0x0002000000000000 LT +0x0004000000000000 EQ +0x0006000000000000 LE +0x0008000000000000 GT +0x000a000000000000 NE +0x000c000000000000 GE + +ISETP, ISET, PSETP, PSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +PSETP, PSET: bool2 +0x0000000000000000 AND +0x0000000001000000 OR +0x0000000002000000 XOR + +ISETP, ISET +0x0000080000000000 X + +LOP: bool +0x0000000000000000 AND +0x0000020000000000 OR +0x0000040000000000 XOR +0x0000060000000000 PASS_B + +LOP: +0x0000010000000000 INV + +LOP: z +0x0000200000000000 Z +0x0000300000000000 NZ + +LOP +0x0007000000000000 noz + +LOP32I: bool +0x0000000000000000 AND +0x0020000000000000 OR +0x0040000000000000 XOR + +PRMT: mode +0x0001000000000000 F4E +0x0002000000000000 B4E +0x0003000000000000 RC8 +0x0004000000000000 ECL +0x0005000000000000 ECR +0x0006000000000000 RC16 + +XMAD: type1 +0x0000000000000000 U16 +0x0001000000000000 S16 + +XMAD: type2 +0x0000000000000000 U16 +0x0002000000000000 S16 + +XMAD: mode 
+0x0000002000000000 MRG +0x0000001000000000 PSL +0x0008000000000000 CHI +0x0004000000000000 CLO +0x000c000000000000 CSFU +0x0004001000000000 PSL.CLO + +XMAD: modec +0x0004000000000000 CLO +0x0008000000000000 CHI +0x000c000000000000 CSFU +0x0040000000000000 X +0x0080000000000000 PSL +0x0100000000000000 MRG +0x0084000000000000 PSL.CLO + +XMAD +0x0010000000000000 CBCC + +XMAD: r8part +0x0000000000000000 H0 +0x0020000000000000 H1 + +XMAD: r20part +0x0000000000000000 H0 +0x0000000800000000 H1 + +XMAD: r20partx +0x0000000000000000 H0 +0x0010000000000000 H1 + +XMAD: r39part +0x0000000000000000 H0 +0x0010000000000000 H1 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part +0x0000000000000000 B0 +0x0000001000000000 B1 +0x0000002000000000 B2 +0x0000003000000000 B3 +0x0000001000000000 H1 +0x0000000000000000 H0 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part +0x0000000000000000 B0 +0x0000000010000000 B1 +0x0000000020000000 B2 +0x0000000030000000 B3 +0x0000000010000000 H1 +0x0000000000000000 H0 + +VMAD +0x0040000000000000 r8neg +0x0020000000000000 r39neg +0x0008000000000000 SHR_7 +0x0010000000000000 SHR_15 +0x0060000000000000 PO +0x0080000000000000 SAT + +VMNMX +0x0100000000000000 MX + +VADD, VABSDIFF, VMNMX +0x0080000000000000 SAT +0x0040000000000000 UD +0x0040000000000000 SD + +VSET: cmp +0x0040000000000000 LT +0x0080000000000000 EQ +0x00c0000000000000 LE +0x0100000000000000 GT +0x0140000000000000 NE +0x0180000000000000 GE + +VADD, VSET: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VABSDIFF: mode +0x0003000000000000 ACC +0x000b000000000000 MIN +0x0013000000000000 MAX +0x0023000000000000 MRG_16H +0x002b000000000000 MRG_16L +0x0033000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x003b000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMNMX: mode +0x0020000000000000 ACC 
+0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1 +0x0000000000000000 U +0x0001000000000000 S + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2 +0x0000000000000000 U +0x0002000000000000 S + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size1 +0x0000000000000000 8 +0x0000004000000000 16 +0x0000006000000000 32 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size2 +0x0000000000000000 8 +0x0000000040000000 16 +0x0000000060000000 32 + +IADD3: type +0x0001000000000000 X +0x0000002000000000 RS +0x0000004000000000 LS + +IADD3: r8part +0x0000000000000000 H0 +0x0000001000000000 H1 + +IADD3: r20part +0x0000000080000000 H0 + +IADD3: r39part +0x0000000200000000 H0 + +IADD3 +0x0008000000000000 r8neg +0x0004000000000000 r20neg +0x0002000000000000 r39neg + +IADD +0x0000080000000000 X +0x0004000000000000 SAT + +IADD, ISCADD +0x0002000000000000 r8neg +0x0001000000000000 r20neg + +IADD32I +0x0100000000000000 r8neg +0x0020000000000000 X + +DEPBAR: SB +0x0000000000000000 SB0 +0x0000000004000000 SB1 +0x0000000008000000 SB2 +0x000000000c000000 SB3 +0x0000000010000000 SB4 +0x0000000014000000 SB5 + +DEPBAR: cmp +0x0000000020000000 LE + +DEPBAR +0x0000000000000001 db0 +0x0000000000000002 db1 +0x0000000000000004 db2 +0x0000000000000008 db3 +0x0000000000000010 db4 +0x0000000000000020 db5 + +F2F, F2I, I2F, I2I: destWidth +0x0000000000000000 8 +0x0000000000000100 16 +0x0000000000000200 32 +0x0000000000000300 64 + +F2F, F2I, I2F, I2I: srcWidth +0x0000000000000000 8 +0x0000000000000400 16 +0x0000000000000800 32 +0x0000000000000c00 64 + +F2F, F2I, I2F, I2I: destSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000001000 S + +F2F, F2I, I2F, I2I: srcSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000002000 S + +F2I, I2F, I2I: r20part +0x0000000000000000 H0 +0x0000040000000000 H1 +0x0000000000000000 
B0 +0x0000020000000000 B1 +0x0000040000000000 B2 +0x0000060000000000 B3 + +F2F: r20part +0x0000000000000000 H0 +0x0000020000000000 H1 + +F2F: round +0x0000040000000000 ROUND +0x0000048000000000 FLOOR +0x0000050000000000 CEIL +0x0000058000000000 TRUNC + +F2I: round +0x0000000000000000 ROUND +0x0000008000000000 FLOOR +0x0000010000000000 CEIL +0x0000018000000000 TRUNC + +HADD2, HMUL2, HFMA2: r8part +0x0001000000000000 H0_H0 +0x0001800000000000 H1_H1 +0x0000800000000000 F32 + +HADD2, HMUL2, HFMA2: r20part +0x0000000020000000 H0_H0 +0x0000000030000000 H1_H1 + +HFMA2: r39part +0x0000000800000000 F32 +0x0000001000000000 H0_H0 +0x0000001800000000 H1_H1 + +HADD2, HMUL2, HFMA2 +0x0000000080000000 r20neg +0x0000000040000000 r39neg + +HADD2, HMUL2, HFMA2: mode +0x0002000000000000 F32 +0x0004000000000000 MRG_H0 +0x0006000000000000 MRG_H1 + +HADD2, HMUL2 +0x0000008000000000 FTZ + +HFMA2 +0x0000002000000000 FTZ + +HFMA2 +0x0000004000000000 FMZ + +HADD2, HMUL2, HFMA2 +0x0000000100000000 SAT + +FADD, DADD, FMUL, DMUL, F2F, I2F: rnd +0x0000000000000000 RN +0x0000008000000000 RM +0x0000010000000000 RP +0x0000018000000000 RZ + +DFMA: rnd +0x0000000000000000 RN +0x0004000000000000 RM +0x0008000000000000 RP +0x000c000000000000 RZ + +FFMA: rnd +0x0000000000000000 RN +0x0008000000000000 RM +0x0010000000000000 RP +0x0018000000000000 RZ + +FFMA +0x0020000000000000 FTZ + +F2F, F2I, FADD, FMUL, FMNMX +0x0000100000000000 FTZ + +FADD32I +0x0080000000000000 FTZ + +FMUL32I +0x0020000000000000 FTZ + +FSET +0x0080000000000000 FTZ + +FSETP, FCMP +0x0000800000000000 FTZ + +FADD, FFMA, FMUL, F2F, I2I +0x0004000000000000 SAT + +FADD, DADD, FMNMX, DMNMX, MUFU +0x0001000000000000 r8neg + +FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I +0x0000200000000000 r20neg + +FMUL, DMUL, FFMA, DFMA +0x0001000000000000 r20neg + +FFMA, DFMA +0x0002000000000000 r39neg + +FADD, DADD, FMNMX, DMNMX +0x0000400000000000 r8abs + +FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I +0x0002000000000000 r20abs + +FSETP, DSETP, 
FSET, DSET +0x0000080000000000 r8neg +0x0000000000000040 r20neg +0x0000000000000080 r8abs +0x0000100000000000 r20abs + +RRO: func +0x0000000000000000 SINCOS +0x0000008000000000 EX2 + +MUFU: func +0x0000000000000000 COS +0x0000000000100000 SIN +0x0000000000200000 EX2 +0x0000000000300000 LG2 +0x0000000000400000 RCP +0x0000000000500000 RSQ +0x0000000000600000 RCP64H +0x0000000000700000 RSQ64H + +FSETP, DSETP, FSET, DSET, FCMP: cmp +0x0001000000000000 .LT +0x0002000000000000 .EQ +0x0003000000000000 .LE +0x0004000000000000 .GT +0x0004000000000000 +0x0005000000000000 .NE +0x0006000000000000 .GE +0x0007000000000000 .NUM +0x0008000000000000 .NAN +0x0009000000000000 .LTU +0x000a000000000000 .EQU +0x000b000000000000 .LEU +0x000c000000000000 .GTU +0x000d000000000000 .NEU +0x000e000000000000 .GEU + +FSETP, DSETP, FSET, DSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +HSETP2: cmp +0x0000002800000000 .NE + +HSETP2: bool +0x0000000000000000 AND + +S2R: sr +0x0000000000000000 LANEID +0x0000000000200000 VIRTCFG +0x0000000000300000 VIRTID +0x0000000002100000 TID.X +0x0000000002200000 TID.Y +0x0000000002300000 TID.Z +0x0000000002500000 CTAID.X +0x0000000002600000 CTAID.Y +0x0000000002700000 CTAID.Z +0x0000000003800000 EQMASK +0x0000000003900000 LTMASK +0x0000000003a00000 LEMASK +0x0000000003b00000 GTMASK +0x0000000003c00000 GEMASK + +CS2R: sr +0x0000000005000000 CLOCKLO +0x0000000005100000 CLOCKHI +0x0000000005200000 GLOBALTIMERLO +0x0000000005300000 GLOBALTIMERHI + +B2R +0x0000e00000000000 nop45 + +BAR +0x0000100000000000 i8w4 +0x0000080000000000 nor20 +0x0000038000000000 nop39 + +BAR: mode +0x0000000000000000 SYNC +0x0000000100000000 ARV +0x0000000200000000 RED + +BAR: red +0x0000000000000000 POPC +0x0000000800000000 AND +0x0000001000000000 OR + +MEMBAR: mode +0x0000000000000000 CTA +0x0000000000000100 GL +0x0000000000000200 SYS + +VOTE: mode +0x0000000000000000 ALL +0x0001000000000000 ANY +0x0002000000000000 EQ + +VOTE +0x00000000000000ff nor0 + 
+BRA +0x0000000000000080 U + +TLDS: chnls +0x0010000000000000 RGBA + +TLDS +0x0002000000000000 NODEP + +LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS +0x000000000000ff00 nor8 + +LD, ST: type +0x0000000000000000 .U8 +0x0020000000000000 .S8 +0x0040000000000000 .U16 +0x0060000000000000 .S16 +0x0080000000000000 +0x0080000000000000 .32 +0x00a0000000000000 .64 +0x00c0000000000000 .128 + +LD, ST: cache +0x0100000000000000 CG +0x0200000000000000 CS +0x0300000000000000 CV +0x0300000000000000 WT + +LDG, STG, LDS, STS, LDL, STL, LDC: type +0x0000000000000000 .U8 +0x0001000000000000 .S8 +0x0002000000000000 .U16 +0x0003000000000000 .S16 +0x0004000000000000 +0x0004000000000000 .32 +0x0005000000000000 .64 +0x0006000000000000 .128 + +LDG, STG: cache +0x0000400000000000 CG +0x0000800000000000 CI +0x0000800000000000 CS +0x0000c00000000000 CV +0x0000c00000000000 WT + +LDL: cache +0x0000200000000000 CI + +LDC: cache +0x0000100000000000 IL + +LDG, STG, LDS, STS, LDL, STL, LDC +0x0000200000000000 E + +LDS +0x0000100000000000 U + +RED: type +0x0000000000000000 +0x0000000000100000 .S32 +0x0000000000200000 .U64 +0x0000000000300000 .F32.FTZ.RN +0x0000000000400000 .F16x2.FTZ.RN +0x0000000000500000 .S64 + +RED: mode +0x0000000000000000 ADD +0x0000000000800000 MIN +0x0000000001000000 MAX +0x0000000001800000 INC +0x0000000002000000 DEC +0x0000000002800000 AND +0x0000000003000000 OR +0x0000000003800000 XOR + +ATOM: type +0x0000000000000000 +0x0002000000000000 .S32 +0x0004000000000000 .U64 +0x0006000000000000 .F32.FTZ.RN +0x0008000000000000 .F16x2.FTZ.RN +0x000a000000000000 .S64 +0x0002000000000000 .64 + +ATOM, RED +0x0001000000000000 E + +ATOM: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x03f0000000000000 CAS + +ATOMS: type +0x0000000000000000 +0x0000000010000000 .S32 +0x0000000020000000 .U64 
+0x0000000030000000 .S64
+0x0010000000000000 .64
+
+ATOMS: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x0240000000000000 CAS
+
+DP4A, DP2A: type1
+0x0000000000000000 U32
+0x0002000000000000 S32
+
+DP4A, DP2A: type2
+0x0000000000000000 U32
+0x0000800000000000 S32
+
+DP2A: mode
+0x0000000000000000 LO
+0x0004000000000000 HI
+};
+
+# The existence of a capture group can map directly to an op code adjustment, or...
+# The named capture group value can map the op code adjustment from among several options
+our %flags;
+my (@ops, $flag);
+foreach my $line (@flags)
+{
+    if ($line =~ m'^(0x[0-9a-z]+)\s*(.*)')    # value line: "<hex> <name>", applies to the ops of the last header line
+    {
+        my $val = hex($1);
+        # named rules (op: name) are stored as $flags{op}{name}{value-name}
+        if ($flag)
+        { $flags{$_}{$flag}{$2} = $val foreach @ops; }
+        # simple existence check rules are stored as $flags{op}{capture-name}
+        else
+        { $flags{$_}{$2} = $val foreach @ops; }
+    }
+    else    # header line: "OP, OP, ..." optionally followed by ": name"
+    {
+        my ($ops, $name) = split ':\s*', $line;
+        @ops = split ',\s*', $ops;
+        $flag = $name;    # undef when the header carries no name
+    }
+}
+
+sub parseInstruct    # returns a hashref of the named captures, or nothing when the rule does not match
+{
+    my ($inst, $grammar) = @_;
+    return unless $inst =~ $grammar->{rule};
+    my %capData = %+;    # copy the captures out of %+ before another match can replace them
+    return \%capData;
+}
+
+# for immediate or constant operands and a given opcode, bits 56-63 get transformed
+my %immedOps = map { $_ => 1 } qw(i20 f20 d20);
+my %immedCodes =
+(
+    0x5c => 0x64,
+    0x5b => 0x6d,
+    0x59 => 0x6b,
+    0x58 => 0x68,
+);
+my %constCodes =
+(
+    c20 => 0x10,
+    c39 => 0x08,
+);
+my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4);
+
+# just pick out the reuse code and nothing else
+sub genReuseCode
+{
+    my $capData = shift;
+    my $reuse = 0;
+    $reuse |= $reuseCodes{$_} foreach grep $capData->{$_}, keys %reuseCodes;    # OR in the bit of every reuse capture that is true
+    return $reuse;
+}
+
+# Generate an op code from regex capture data; returns the list ($code, $reuse)
+# if you pass in a test array ref it will populate it with the matching capture groups
+sub genCode
+{
+    my ($op, $grammar, $capData, $test) = @_;
+
+    my $flags = 
$flags{$op};
+    my $code = $grammar->{code};
+    my $reuse = 0;
+    my $immedCode = $immedCodes{$code >> 56};    # XOR mask for bits 56-63, looked up by the base code's top byte
+
+    #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I';
+
+    # process the instruction predicate (if valid for this instruction)
+    if (exists $capData->{noPred})
+    {
+        delete $capData->{noPred};
+        push @$test, 'noPred' if $test;
+    }
+    else
+    {
+        my $p = defined($capData->{predNum}) ? $capData->{predNum} : 7;    # 7 is used when no predicate number was captured
+        push @$test, 'predNum' if $test;
+        if (exists $capData->{predNot})
+        {
+            $p |= 8;    # bit 3 of the field marks a negated predicate
+            push @$test, 'predNot' if $test;
+        }
+        $code ^= $p << 16;    # the predicate field is XORed in starting at bit 16
+        delete @{$capData}{qw(predNum predNot)};
+
+    }
+    # process the register reuse flags
+    foreach my $rcode (qw(reuse1 reuse2 reuse3))
+    {
+        if (delete $capData->{$rcode})    # deleted so the generic capture loop below never sees it
+        {
+            $reuse |= $reuseCodes{$rcode};
+            push @$test, $rcode if $test;
+        }
+    }
+
+    foreach my $capture (keys %$capData)
+    {
+        # change the base code for immediate versions of the op
+        if (exists $immedOps{$capture})
+        { $code ^= $immedCode << 56; }
+        # change the base code for constant versions of the op
+        elsif (exists $constCodes{$capture})
+        { $code ^= $constCodes{$capture} << 56; }
+
+        # if capture group is an operand then process and add that data to code
+        if (exists $operands{$capture})
+        {
+            # don't process the r20 that comes with the r39s20 capture
+            unless ($capture eq 'r20' && exists $capData->{r39s20})
+            {
+                $code ^= $operands{$capture}->($capData->{$capture});
+                push @$test, $capture if $test;
+            }
+        }
+
+        # Add matching flags (an operand might also add/remove a flag)
+        if (exists $flags->{$capture})
+        {
+            # a named multivalue flag: the captured text selects the value
+            if (ref $flags->{$capture})
+            {
+                $code ^= $flags->{$capture}{$capData->{$capture}};
+                push @$test, "$capture:$capData->{$capture}" if $test;
+            }
+            # a simple exists flag
+            else
+            {
+                $code ^= $flags->{$capture};
+                push @$test, $capture if $test;
+            }
+        }
+        elsif (!exists $operands{$capture} && !$test)
+        {
+            # Every capture group should be acted upon. Missing one is a bug.
+            warn "UNUSED: $op: $capture: $capData->{$capture}\n";
+            warn Dumper($flags);
+        }
+    }
+
+    return $code, $reuse;
+}
+
+
+my $CtrlRe = qr'(?<ctrl>[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])';    # wait:read:write:yield:stall, decoded by readCtrl
+my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';
+my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;
+my $CommRe = qr'(?<comment>.*)';
+
+sub processAsmLine    # parse one "ctrl [@pred] OP operands; comment" line into a hashref; undef when it does not match
+{
+    my ($line, $lineNum) = @_;
+
+    if ($line =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o)
+    {
+        return {
+            lineNum => $lineNum,
+            pred => $+{pred},
+            predReg => $+{predReg},
+            space => $+{space},
+            op => $+{op},
+            comment => $+{comment},
+            inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}),
+            ctrl => readCtrl($+{ctrl}, $line),
+        };
+    }
+    return undef;
+}
+
+sub processSassLine    # parse one "/*num*/ inst /* 0xcode" disassembly line into a hashref; undef when it does not match
+{
+    my $line = shift;
+
+    if ($line =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o)
+    {
+        return {
+            num => hex($+{num}),
+            pred => $+{pred},
+            op => $+{op},
+            ins => normalizeSpacing($+{op} . $+{rest}),
+            inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}),
+            code => hex($+{code}),
+        };
+    }
+    return undef;
+}
+
+sub processSassCtrlLine    # unpack a "/* 0x..." control word into @$ctrl and @$ruse; returns 1 if the line matched, else 0
+{
+    my ($line, $ctrl, $ruse) = @_;
+
+    return 0 unless $line =~ m'^\s+\/\* (0x[0-9a-f]+)';
+
+    my $code = hex($1);
+    if (ref $ctrl)    # three 17-bit control fields, at bits 0, 21 and 42
+    {
+        push @$ctrl, ($code & 0x000000000001ffff) >> 0;
+        push @$ctrl, ($code & 0x0000003fffe00000) >> 21;
+        push @$ctrl, ($code & 0x07fffc0000000000) >> 42;
+    }
+    if (ref $ruse)    # three 4-bit reuse fields, at bits 17, 38 and 59
+    {
+        push @$ruse, ($code & 0x00000000001e0000) >> 17;
+        push @$ruse, ($code & 0x000003c000000000) >> 38;
+        push @$ruse, ($code & 0x7800000000000000) >> 59;
+    }
+    return 1;
+}
+
+sub replaceXMADs    # expand the XMAD.LO / .LO2 / .LO2C macros in the source text and return the rewritten text
+{
+    my $file = shift;
+
+# XMAD.LO d, a, b, c, x;
+# ----------------------
+# XMAD.MRG x, a, b.H1, RZ;
+# XMAD d, a, b, c;
+# XMAD.PSL.CBCC d, a.H1, x.H1, d;
+# ----------------------
+# XMAD d, a, 0xffff, c;
+# XMAD.PSL d, a.H1, 0xffff, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/
+
+        die "XMAD.LO: Destination and 
first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
+%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
+%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
+        @+{qw(ctrl space pred d a b c x comment)}
+    /egmos;
+
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/
+
+        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
+        @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/
+
+        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
+        @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    #TODO: add more XMAD macros
+    return $file;
+}
+# convert tabs and runs of whitespace to single spaces to make our re's simpler
+sub normalizeSpacing
+{
+    my $inst = shift;
+    $inst =~ s/\t/ /g;
+    $inst =~ s/\s{2,}/ /g;
+    return $inst;
+}
+
+
+# map binary control notation onto an easier to work with text format (inverse of readCtrl).
+sub printCtrl
+{
+    my $code = shift;
+
+    my $stall = ($code & 0x0000f) >> 0;
+    my $yield = ($code & 0x00010) >> 4;
+    my $wrtdb = ($code & 0x000e0) >> 5; # write dependency barrier
+    my $readb = ($code & 0x00700) >> 8; # read dependency barrier
+    my $watdb = ($code & 0x1f800) >> 11; # wait on dependency barrier
+
+    $yield = $yield ? '-' : 'Y';    # a set yield bit prints as '-', a clear one as 'Y'
+    $wrtdb = $wrtdb == 7 ? '-' : $wrtdb + 1;    # 7 means "no barrier"; otherwise shown 1-based
+    $readb = $readb == 7 ? '-' : $readb + 1;
+    $watdb = $watdb ? 
sprintf('%02x', $watdb) : '--';
+
+    return sprintf '%s:%s:%s:%s:%x', $watdb, $readb, $wrtdb, $yield, $stall;
+}
+sub readCtrl    # parse "wait:read:write:yield:stall" text back into the packed control value; dies on an out-of-range wait mask
+{
+    my ($ctrl, $context) = @_;
+    my ($watdb, $readb, $wrtdb, $yield, $stall) = split ':', $ctrl;
+
+    $watdb = $watdb eq '--' ? 0 : hex $watdb;
+    $readb = $readb eq '-' ? 7 : $readb - 1;    # '-' encodes as 7; numbered barriers are stored 0-based
+    $wrtdb = $wrtdb eq '-' ? 7 : $wrtdb - 1;
+    $yield = $yield eq 'y' || $yield eq 'Y' ? 0 : 1;
+    $stall = hex $stall;
+
+    die sprintf('wait dep out of range(0x00-0x3f): %x at %s', $watdb, $context) if $watdb != ($watdb & 0x3f);
+
+    return
+        $watdb << 11 |
+        $readb << 8 |
+        $wrtdb << 5 |
+        $yield << 4 |
+        $stall << 0;
+}
+
+sub getRegNum    # returns the mapped value for $regName, or $regName itself when it is unmapped or maps to a reference
+{
+    my ($regMap, $regName) = @_;
+
+    return !exists($regMap->{$regName}) || ref($regMap->{$regName}) ? $regName : $regMap->{$regName};
+}
+
+sub getVecRegisters    # returns the list of registers written through operand r0 (nothing for a missing r0 or RZ)
+{
+    my ($vectors, $capData) = @_;
+    my $regName = $capData->{r0} or return;
+
+    return if $regName eq 'RZ';
+
+    if (($capData->{type} // '') eq '.64' || ($capData->{i31w4} // '') eq '0x3')    # either capture may be absent: compare against '' instead of undef
+    {
+        if ($regName =~ m'^R(\d+)$')
+        {
+            return map "R$_", ($1 .. $1+1);
+        }
+        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
+        return @{$vectors->{$regName}}[0,1];
+    }
+    if (($capData->{type} // '') eq '.128' || ($capData->{i31w4} // '') eq '0xf')
+    {
+        if ($regName =~ m'^R(\d+)$')
+        {
+            return map "R$_", ($1 .. $1+3);
+        }
+        confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
+        return @{$vectors->{$regName}};
+    }
+    return $regName;
+}
+
+sub getAddrVecRegisters    # returns the register(s) used by address operand r8; a pair when the E flag was captured
+{
+    my ($vectors, $capData) = @_;
+    my $regName = $capData->{r8} or return;
+
+    return if $regName eq 'RZ';
+
+    if (exists $capData->{E})
+    {
+        if ($regName =~ m'^R(\d+)$')
+        {
+            return map "R$_", ($1 .. 
$1+1); + } + print Dumper($vectors) unless exists $vectors->{$regName}; + confess "$regName not a 64bit vector register" unless exists $vectors->{$regName}; + return @{$vectors->{$regName}}[0,1]; + } + return $regName; +} + +__END__ + + + diff --git a/Assembler/MaxAs/microbench/microbench.cpp b/Assembler/MaxAs/microbench/microbench.cpp new file mode 100644 index 0000000..7b0187a --- /dev/null +++ b/Assembler/MaxAs/microbench/microbench.cpp @@ -0,0 +1,212 @@ +// microbench.cpp : Defines the entry point for the console application. +// + +// nvcc -l cuda -o microbench microbench.cpp + +#include +#include +#include +#include +#include + +CUcontext hContext = 0; + +#define CUDA_CHECK( fn ) do { \ + CUresult status = (fn); \ + if ( CUDA_SUCCESS != status ) { \ + const char* errstr; \ + cuGetErrorString(status, &errstr); \ + printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \ + if (hContext) cuCtxDestroy(hContext); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + + +int main(int argc, char* argv[]) +{ + //int iTest = 2896; + //while (iTest < 0x7fff) + //{ + // int iResult = iTest * iTest; + // float fTest = (float)iTest; + // int fResult = (int)(fTest * fTest); + + // printf("i*i:%08x f*f:%08x\n", iResult, fResult); + + // iTest += 0x0800; + //} + //exit(0); + + char deviceName[32]; + int devCount, ordinal, major, minor; + CUdevice hDevice; + + // Initialize the Driver API and find a device + CUDA_CHECK( cuInit(0) ); + CUDA_CHECK( cuDeviceGetCount(&devCount) ); + for (ordinal = 0; ordinal < devCount; ordinal++) + { + CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); + CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); + CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); + CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); + if (major >= 5 && minor >= 2) + { + printf("Using: Id:%d %s 
(%d.%d)\n\n", ordinal, deviceName, major, minor);
+            break;
+        }
+    }
+    if (ordinal == devCount) // the loop ran to completion without accepting any device
+    {
+        printf("No compute 5.0 device found, exiting.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
+    int internalTiming = 1;
+    if (argc > 1)
+        internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;
+
+    // Second command line arg is the number of blocks
+    int blocks = 1;
+    if (argc > 2)
+        blocks = atoi(argv[2]);
+    if (blocks < 1)
+        blocks = 1;
+
+    // Third command line arg is the number of threads
+    int threads = 128;
+    if (argc > 3)
+        threads = atoi(argv[3]);
+    if (threads > 1024 || threads < 32)
+        threads = 128;
+    threads &= -32; // round down to a multiple of 32
+
+    // Fourth command line arg depends on the timing mode:
+    double fops = 1.0;
+    int lanes = 1;
+    if (argc > 4)
+    {
+        if (internalTiming)
+        {
+            // The number of lanes to print for each warp
+            lanes = atoi(argv[4]);
+            if (lanes > 32 || lanes < 1)
+                lanes = 1;
+        }
+        else
+            // The number of floating point operations in a full kernel launch
+            fops = atof(argv[4]);
+    }
+
+    // Fifth command line arg is the repeat count for benchmarking
+    int repeat = 1;
+    if (argc > 5)
+        repeat = atoi(argv[5]);
+    if (repeat > 1000 || repeat < 1)
+        repeat = 1;
+
+    // Buffer size in bytes: one int per thread across all blocks
+    size_t size = sizeof(int) * threads * blocks;
+
+    // Setup our input and output buffers
+    int* dataIn = (int*)malloc(size);
+    int* dataOut = (int*)malloc(size);
+    int* clocks = (int*)malloc(size);
+    memset(dataIn, 0, size);
+
+    CUmodule hModule;
+    CUfunction hKernel;
+    CUevent hStart, hStop;
+    CUdeviceptr devIn, devOut, devClocks;
+
+    // Init our context and device memory buffers
+    CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
+    CUDA_CHECK( cuMemAlloc(&devIn, size) );
+    CUDA_CHECK( cuMemAlloc(&devOut, size) );
+    CUDA_CHECK( cuMemAlloc(&devClocks, size) );
+    CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
+    CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
+    CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );
+
+    CUDA_CHECK( 
cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); + CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); + + // Load our kernel + CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); + CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); + + // Setup the params + void* params[] = { &devOut, &devClocks, &devIn }; + float ms = 0; + + // Warm up the clock (unless under nsight) + if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER + for (int i = 0; i < repeat; i++) + CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); + + // Launch the kernel + CUDA_CHECK( cuEventRecord(hStart, NULL) ); + //CUDA_CHECK( cuProfilerStart() ); + CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); + //CUDA_CHECK( cuProfilerStop() ); + CUDA_CHECK( cuEventRecord(hStop, NULL) ); + CUDA_CHECK( cuEventSynchronize(hStop) ); + CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); + + //CUDA_CHECK( cuCtxSynchronize() ); + + // Get back our results from each kernel + CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); + CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); + + // Cleanup and shutdown of cuda + CUDA_CHECK( cuEventDestroy(hStart) ); + CUDA_CHECK( cuEventDestroy(hStop) ); + CUDA_CHECK( cuModuleUnload(hModule) ); + CUDA_CHECK( cuMemFree(devIn) ); + CUDA_CHECK( cuMemFree(devOut) ); + CUDA_CHECK( cuMemFree(devClocks) ); + CUDA_CHECK( cuCtxDestroy(hContext) ); + hContext = 0; + + // When using just one block, print out the internal timing data + if (internalTiming) + { + int count = 0, total = 0, min = 999999, max = 0; + + int* clocks_p = clocks; + int* dataOut_p = dataOut; + + // Loop over and print results + for (int blk = 0; blk < blocks; blk++) + { + float *fDataOut = reinterpret_cast(dataOut_p); + + for(int tid = 0; tid < threads; tid += 32) + { + // Sometimes we want data on each thread, sometimes just one sample per warp is fine + for (int lane = 0; lane < lanes; 
lane++) + printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u + + count++; + total += clocks_p[tid]; + if (clocks_p[tid] < min) min = clocks_p[tid]; + if (clocks_p[tid] > max) max = clocks_p[tid]; + } + clocks_p += threads; + dataOut_p += threads; + } + printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); + } + else + { + // For more than one block we're testing throughput and want external timing data + printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); + } + // And free up host memory + free(dataIn); free(dataOut); free(clocks); + + return 0; +} diff --git a/Assembler/MaxAs/microbench/microbench.cu b/Assembler/MaxAs/microbench/microbench.cu new file mode 100644 index 0000000..7d4cd8f --- /dev/null +++ b/Assembler/MaxAs/microbench/microbench.cu @@ -0,0 +1,69 @@ + +// Note this file isn't configured to automatically compile + +#include +#include + +// Build: +// nvcc -l cuda -o microbench microbench.cpp +// nvcc -arch sm_50 -cubin microbench.cu + +// Inspect a cubin (use nvdisasm from cuda 6.5 for best results): +// maxas.pl -e microbench.cubin + +// Insert new sass into cubin +// maxas.pl -i microbench.sass microbench.cubin + +// run it: +// ./microbench + +// Use extern C so C++ doesn't mangle our kernel name +extern "C" __global__ void microbench(int *out, int *clocks, int *in) +{ + __shared__ int share[1024]; + + int tid = threadIdx.x; + int bx = blockIdx.x; + int by = blockIdx.y; + + int start = clock(); + + share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ + + __syncthreads(); + + int end = clock(); + + clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start; + + out[tid] = share[tid ^ 1]; +} + +// A note about using the Cuda Runtime. 
+// If that's your preference over the driver API then here's what you'd do:
+
+// In your project properties in the Cuda C/C++ panel:
+//    -Set the "Keep Processed Files" (-keep) option
+//    -Add a -v manually to the command line
+// If compiling on command line just add -keep -v options to nvcc.
+// Rebuild your solution and look in the log for these lines that follow the ptxas step:
+
+// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
+// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
+// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
+
+// You just need to manually run these 3 commands (or add them to a build script)
+// after you've modified the cubin generated from the preceding ptxas command.
+// That will give you a new .cu.obj file which will automatically be linked in for you next time you
+// build your project (or you could manually run the linker step as well).
+
+// Having done that you can call your kernel normally using the <<< >>> syntax.
+// Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
+// With fatbin you can also keep non-maxwell optimized versions of your code.
+
+
+// I just discovered this also works as a shortcut to the above:
+// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu
+
+// The cu kernel definitions above need to have empty bodies.
+// And, the cu file must be compiled to a lib separately before linking.
\ No newline at end of file diff --git a/Assembler/MaxAs/microbench/microbench.sass b/Assembler/MaxAs/microbench/microbench.sass new file mode 100644 index 0000000..609274a --- /dev/null +++ b/Assembler/MaxAs/microbench/microbench.sass @@ -0,0 +1,72 @@ +# Kernel: microbench + +// This is a simple micro bench to demonstrate the latency in loading SR_TID.X + + + blockDimX : c[0x0][0x08] + blockDimY : c[0x0][0x0c] + blockDimZ : c[0x0][0x10] + gridDimX : c[0x0][0x14] + gridDimY : c[0x0][0x18] + gridDimZ : c[0x0][0x1c] + + param_out[0] : c[0x0][0x140] + param_out[1] : c[0x0][0x144] + param_clocks[0] : c[0x0][0x148] + param_clocks[1] : c[0x0][0x14c] + param_in[0] : c[0x0][0x150] + param_in[1] : c[0x0][0x154] + + + + + 0-1 : out<0-1> + 2-3 : clocks<0-1> + 4-5 : in<0-1> + 6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x + + + +// Load in our params (not currently used below) +--:-:-:-:1 MOV in0, param_in[0]; +--:-:-:-:1 MOV in1, param_in[1]; + +// Get the first clock value +--:-:-:-:1 CS2R clock1, SR_CLOCKLO; + +// Get the threadId and blockId +// Set the Read-After-Write dependency barrier 1 and 2 +--:-:1:-:1 S2R tid, SR_TID.X; +// Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it +--:-:2:-:2 S2R bid, SR_CTAID.X; + + +// Get the second clock value +// Wait on the depenedency barriers that were set in the prior instruction +// Stall 6 to allow CS2R time to complete before next instruction +// CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks +// This stall count does not factor into the time calculation at all +03:-:-:-:6 CS2R clock2, SR_CLOCKLO; + +// Take the difference of clocks +--:-:-:-:1 IADD clock1, clock2, -clock1; + +// Setup our output addresses +// Stall your pipeline dependencies properly +// Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code +--:-:-:-:6 XMAD offset, bid, blockDimX, tid; + +// LEA is 
"load effective address" +// The offset param is shifted left 2 and added to the pointers with 64bit math +--:-:-:-:6 LEA clocks0.CC, offset, param_clocks[0], 2; +--:-:-:-:1 LEA.HI.X clocks1, offset, param_clocks[1], RZ, 2; + +--:-:-:-:6 LEA out0.CC, offset, param_out[0], 2; +--:-:-:-:1 LEA.HI.X out1, offset, param_out[1], RZ, 2; + +// Output the results. +// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values +--:-:-:-:1 STG.E [clocks], clock1; +--:-:-:-:1 STG.E [out], offset; # use this to return whatever you like to inspect the results +--:-:-:-:5 EXIT; + diff --git a/Assembler/MaxAs/microbench/shared.pl b/Assembler/MaxAs/microbench/shared.pl new file mode 100755 index 0000000..f760664 --- /dev/null +++ b/Assembler/MaxAs/microbench/shared.pl @@ -0,0 +1,12 @@ +#!/usr/bin/perl +use strict; + +print `maxas.pl -i shared_sts16.sass microbench.cubin`; + +exit if $?; + +print `Release\\microbench.exe i 1 64`; + + +__END__ + diff --git a/Assembler/MaxAs/microbench/shared_lds.sass b/Assembler/MaxAs/microbench/shared_lds.sass new file mode 100644 index 0000000..5f31dcf --- /dev/null +++ b/Assembler/MaxAs/microbench/shared_lds.sass @@ -0,0 +1,122 @@ +# Kernel: microbench +# InsCnt: 18 +# RegCnt: 5 +# SharedSize: 4096 +# BarCnt: 1 +# Params(3): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 + +// This is a simple micro bench to demonstrate the latency in loading SR_TID.X + + + + 0-3 : result, a, b, c + + 4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20> + + + +// Load in our params +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R bid, SR_CTAID.X; + +--:-:-:-:1 MOV result, c[0x0][0x0]; +--:-:-:-:1 MOV in, c[0x0][0x100]; + +--:-:-:-:1 CS2R clock1, SR_CLOCKLO; +--:-:-:-:1 MOV result, c[0x0][0x13c]; +--:-:-:-:1 CS2R clock2, SR_CLOCKLO; + +--:-:-:-:1 MOV blockDim, c[0x0][0x8]; +--:-:-:-:1 MOV out, c[0x0][0x140]; +--:-:-:-:1 MOV 
clocks, c[0x0][0x144]; + + + + + + +03:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 LOP.AND tid128, tid, 128; + +// readAs = ((tid128 >> 4) | tid7) << 4 +--:-:-:-:1 SHR.U32 readAs, tid128, 4; +--:-:-:-:1 LOP.OR readAs, readAs, tid7; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid96 >> 3) | tid3) << 4 +--:-:-:-:1 SHR.U32 readBs, tid96, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid3; +#--:-:-:-:1 SHL readBs, readBs, 4; +#--:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; + + + + + + +#--:-:-:-:1 LDS.U.128 result, [readBs]; + + + + +01:-:-:-:1 IADD clock1, clock2, -clock1; + + +--:-:-:-:1 XMAD tid, blockDim, bid, tid; +--:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; +--:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; +--:-:-:Y:6 SHL tid, tid, 0x2; + +--:-:-:-:1 IADD clocks, clocks, tid; +--:-:-:-:2 IADD out, out, tid; + +--:-:-:-:1 STG [clocks], clock1; +--:-:-:-:1 STG [out], readBs; +--:-:-:-:5 EXIT; + + + +--:-:-:-:4 LOP.AND tid32, tid, -32; + +--:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; + +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; + + +// readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; +--:-:-:-:1 LOP.AND readAs, 
tid, 0x80; +--:-:-:-:1 SHR.U32 readAs, readAs, 4; +--:-:-:-:1 LOP.OR readAs, readAs, tid7; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid1, tid, 0x1; +--:-:-:-:1 LOP.AND readBs, tid, 0x70; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; + + + \ No newline at end of file diff --git a/Assembler/MaxAs/microbench/shared_sts16.sass b/Assembler/MaxAs/microbench/shared_sts16.sass new file mode 100644 index 0000000..2f6eb39 --- /dev/null +++ b/Assembler/MaxAs/microbench/shared_sts16.sass @@ -0,0 +1,116 @@ +# Kernel: microbench +# InsCnt: 18 +# RegCnt: 5 +# SharedSize: 4096 +# BarCnt: 1 +# Params(3): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 + +// This is a simple micro bench to demonstrate the latency in loading SR_TID.X + + + + 0-3 : result, a, b, c + + 4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20> + + + +// Load in our params +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R bid, SR_CTAID.X; + +//--:-:-:-:1 MOV result, c[0x0][0x0]; +//--:-:-:-:1 MOV in, c[0x0][0x100]; +--:-:-:-:1 MOV result, 1; + +--:-:-:-:1 MOV blockDim, c[0x0][0x8]; +--:-:-:-:1 MOV out, c[0x0][0x140]; +--:-:-:-:1 MOV clocks, c[0x0][0x144]; + + +// readAs = ((tid >> 1) & 7) << 4; +03:-:-:-:6 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:6 SHL readAs, readAs, 3; + +// readBs = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 1024; +--:-:-:-:6 LOP.AND tid1, tid, 1; +--:-:-:-:6 LOP.AND readBs, tid, 0x30; +--:-:-:-:6 SHR.U32 readBs, readBs, 3; +--:-:-:-:6 LOP.OR readBs, readBs, tid1; +--:-:-:-:6 ISCADD readBs, readBs, 0, 3; + + + +///--:-:-:-:1 STS [tid32], result; +//--:-:-:-:1 STS.S16 [tid32 + 2x<32>], result; +//--:-:1:-:2 LDS.U.64 result, [readBs]; + +--:-:-:-:0 CS2R clock1, SR_CLOCKLO; +--:-:1:-:6 LDS.U.64 result, [readAs]; +--:-:-:-:6 CS2R clock2, 
SR_CLOCKLO; + + +01:-:-:-:1 IADD clock1, clock2, -clock1; + + +--:-:-:-:1 XMAD tid, blockDim, bid, tid; +--:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; +--:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; +--:-:-:Y:6 SHL tid, tid, 0x2; + +--:-:-:-:1 IADD clocks, clocks, tid; +--:-:-:-:2 IADD out, out, tid; + +--:-:-:-:1 STG [clocks], clock1; +--:-:-:-:1 STG [out], result; +--:-:-:-:5 EXIT; + + + +--:-:-:-:4 LOP.AND tid32, tid, -32; + +--:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; + +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; + +03:-:-:-:6 LOP.AND tid31, tid, 31; +--:-:-:-:6 LOP.AND tid32, tid, 32; +--:-:-:-:6 SHL tid32, tid32, 0x2; +--:-:-:-:6 LOP.OR tid32, tid32, tid31; +--:-:-:-:6 SHL tid32, tid32, 0x2; + +// readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; +--:-:-:-:1 LOP.AND readAs, tid, 0x80; +--:-:-:-:1 SHR.U32 readAs, readAs, 4; +--:-:-:-:1 LOP.OR readAs, readAs, tid7; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid1, tid, 0x1; +--:-:-:-:1 LOP.AND readBs, tid, 0x70; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; + + + \ No 
newline at end of file diff --git a/Assembler/MaxAs/microbench/throughput.pl b/Assembler/MaxAs/microbench/throughput.pl new file mode 100755 index 0000000..56df6e7 --- /dev/null +++ b/Assembler/MaxAs/microbench/throughput.pl @@ -0,0 +1,80 @@ +#!/usr/bin/perl +use strict; + +my $loopSize = 512; +my $blocks = 32; +my $loops = 10240000; +my $fileName = 'throughput2.sass'; + +writeSassFile($fileName, $loops); + +#print `maxas.pl -p $fileName`; +#exit; + +print `maxas.pl -i $fileName microbench.cubin`; +exit if $?; + +foreach my $thread128 (2) +{ + my $threads = $thread128 * 128; + my $fops = 2 * $loops * $loopSize * $blocks * $threads; + + my $data = `Release\\microbench.exe e $blocks $threads $fops`; + + my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + + printf "%d %d %d\n", $thread128, $threads, $gflops; +} + +exit; + +sub writeSassFile +{ + my ($filename, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'EOF', $loops; +# Kernel: microbench + + + + 0-10 : result, r1, r2, r3 + 20-27 ~ count, stop + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +--:-:-:-:1 MOV32I r1, 1.0; +--:-:-:-:1 MOV32I r2, 1.0; +--:-:-:-:4 MOV32I r3, 1.0; + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + foreach my $i (0 .. 511) + { + my $yield = ($i + 32) & 63 ? '-' : 'Y'; + + my $stall = $i == 511 ? 
0 : 1; + + $out .= "--:-:-:$yield:$stall FFMA result, r1, r2, r3;\n"; + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +EOF + + close $fh; +} + +__END__ + diff --git a/Assembler/MaxAs/microbench/throughput.sass b/Assembler/MaxAs/microbench/throughput.sass new file mode 100644 index 0000000..796502f --- /dev/null +++ b/Assembler/MaxAs/microbench/throughput.sass @@ -0,0 +1,95 @@ +# Kernel: microbench +# InsCnt: 18 +# RegCnt: 5 +# SharedSize: 4096 +# BarCnt: 1 +# Params(3): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 + + + + 8-20 : count + + + +--:-:-:-:1 MOV R0, RZ; +--:-:-:-:1 MOV R1, RZ; +--:-:-:-:1 MOV R2, RZ; +--:-:-:-:1 MOV R3, RZ; +--:-:-:-:1 MOV R4, RZ; +--:-:-:-:1 MOV R5, RZ; +--:-:-:-:1 MOV R6, RZ; +--:-:-:-:1 MOV R7, RZ; +--:-:-:-:1 MOV R8, RZ; +--:-:-:Y:6 MOV count, RZ; + +// This loop is capable of running at 1700 GFlops on GM107. +// You can tweak it to see how register bank conflicts or different control codes +// affect performance. +// With throughput.pl you can pass params to this code and do some autotuning. +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, 0x19000, PT; +--:-:-:-:1 IADD count, count, 0x1; + + + my $out; + + foreach my $i (0..511) #511 + { + my $y = ($i + 32) & 63 ? 
'-' : 'Y'; + + $out .= qq| +--:-:-:$y:1 FFMA R0, R1, R2, R3;|; #c[0x0][$c] + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; + +--:-:-:-:5 EXIT; + + + + + open my $fh, 'params.txt'; + my $line = <$fh>; + close $fh; + my ($r1, $r2, $r3) = split "\t", $line; + + 80-95 : out, clocks, in, tid, clock1, clock2, result + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:-:-:1 MOV out, c[0x0][0x140]; +--:-:-:-:1 MOV clocks, c[0x0][0x144]; +01:-:-:-:1 MOV in, c[0x0][0x148]; + + + +--:-:-:-:1 MOV32I f0, 0x3f800000; +--:-:-:-:1 MOV32I f1, 0x3f800000; +--:-:-:-:1 MOV32I f2, 0x3f800000; +--:-:-:-:5 MOV32I f3, 0x3f800000; + +--:-:-:-:1 CS2R clock1, SR_CLOCKLO; + + +--:-:-:-:1 CS2R clock2, SR_CLOCKLO; + +--:-:-:-:6 MOV32I result, 0x457; +--:-:-:-:1 IADD clock1, clock2, -clock1; + + +--:-:-:-:6 SHL tid, tid, 0x2; +--:-:-:-:1 IADD clocks, clocks, tid; +--:-:-:-:1 IADD out, out, tid; + +--:-:-:-:1 STG [clocks], clock1; +--:-:-:-:1 STG [out], R24; + + + \ No newline at end of file diff --git a/Assembler/MaxAs/microbench/throughput2.pl b/Assembler/MaxAs/microbench/throughput2.pl new file mode 100755 index 0000000..ea7e19f --- /dev/null +++ b/Assembler/MaxAs/microbench/throughput2.pl @@ -0,0 +1,158 @@ +#!/usr/bin/perl +use strict; +my %p; + +$p{N} = 8192; +$p{blocking} = 8; +$p{unroll} = 8; +$p{threads} = 64; #256 + +$p{csize} = $p{blocking} * $p{blocking}; +$p{loopSize} = $p{unroll} * $p{csize}; +$p{width} = sqrt($p{csize} * $p{threads}); +$p{blocks} = ($p{N} / $p{width}) * ($p{N} / $p{width}); +$p{loops} = $p{N} / $p{unroll}; +$p{fops} = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads}; + +my $fileName = 'throughput2.sass'; + +my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops); + +#print join("\t", @params), "\n"; +#print join("\t", @p{@params}), "\n"; + +print map sprintf("%-9s: %d\n", $_, $p{$_}), @params; + +writeSassFile($fileName, $p{loopSize}, $p{loops}); + +#print `maxas.pl -p $fileName`; +#exit; + +print `maxas.pl -i $fileName microbench.cubin`; + 
+exit if $?; + +my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`; + +my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + +print $data; + +#printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; + + + + +sub writeSassFile +{ + my ($filename, $loopSize, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'END_SASS', $loops; +# Kernel: microbench + + + + 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> + 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> + 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> + 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> + 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> + + 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> + 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> + + 0-127 : r<0-127> + + 100-101 : count, stop + + //102-112 ~ readAs, readBs, writeS + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +//--:-:-:-:1 MOV writeS, RZ; +//--:-:-:-:1 MOV readAs, RZ; +//--:-:-:-:1 MOV readBs, RZ; + + + return join '', map "--:-:-:-:1 MOV32I r$_, 1.0;\n", 0..95; + + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + + my @cOrder; + #my @swirl = ([0,1],[0,0],[2,0],[2,1]); + my @swirl = ([2,0],[2,1],[0,1],[0,0]); + #my @swirl = ([0,1],[0,0],[1,0],[1,1]); + my @xVals = (0,1,64,65); + #my @xVals = (0,2,64,66); + + my @yVals = (0,2,64,66); + + foreach my $y (@yVals) + { + foreach my $x (@xVals) + { + push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl; + } + @xVals = reverse @xVals; + } + + foreach my $j (0..7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + + my %%insert; + + #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6; + + $insert{c62} = + "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . + "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 
+ "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . + "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . + "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8; + + foreach my $c (0 .. 63) + { + my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/; + my $ins = $insert{"c$c"} || ''; + my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins || + my $yield = $c == 32 ? 'Y' : '-'; + my $wait = '--'; #$c ? '--' : '01'; + + $out .= "$wait:-:-:$yield:$stall FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins"; + } + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +END_SASS + + close $fh; +} + +__END__ + + my %%insert = ( + c0 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n", + c2 => "--:-:-:-:1 LDS.U.128 j${nOdd}By00, [readBs+0x10];\n", + c4 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n", + c6 => "--:-:1:-:1 LDS.U.128 j${nOdd}By64, [readBs+0x10];\n", + ); \ No newline at end of file diff --git a/Assembler/MaxAs/microbench/throughput2.sass b/Assembler/MaxAs/microbench/throughput2.sass new file mode 100644 index 0000000..3db5130 --- /dev/null +++ b/Assembler/MaxAs/microbench/throughput2.sass @@ -0,0 +1,47 @@ +# Kernel: microbench + + + + 0-10 : result, r1, r2, r3 + 20-27 ~ count, stop + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, 102400; +--:-:-:-:1 MOV32I r1, 1.0; +--:-:-:-:1 MOV32I r2, 1.0; +--:-:-:-:4 MOV32I r3, 1.0; + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + foreach my $i (0 .. 511) + { + my $yield = ($i + 32) & 63 ? '-' : 'Y'; + + my $stall = $i == 511 ? 
0 : 1; + + #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; + + #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; + #$out .= "--:-:-:-:1 MOV result, RZ;\n"; + + $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; + #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; + #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; diff --git a/Assembler/MaxAs/microbench/throughput3.pl b/Assembler/MaxAs/microbench/throughput3.pl new file mode 100755 index 0000000..ff9077a --- /dev/null +++ b/Assembler/MaxAs/microbench/throughput3.pl @@ -0,0 +1,90 @@ +#!/usr/bin/perl +use strict; + +my %data; + +foreach my $thread128 (1 .. 8) +{ + foreach my $size64 (8 .. 16) + { + my $loopSize = $size64 * 64; + my $loops = int(2 * 1638400 / ($size64 * $thread128)); + + my $blocks = 16; + my $threads = $thread128 * 128; + my $fops = 2 * $loops * $loopSize * $blocks * $threads; + my $fileName = 'throughput2.sass'; + + #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops; + #next; + + writeSassFile($fileName, $loopSize, $loops); + + `maxas.pl -i $fileName microbench.cubin`; + + exit if $?; + + my $data = `Release\\microbench.exe e $blocks $threads $fops`; + + my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + + printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; + + push @{$data{$loopSize}}, $gflops; + } +} +print join("\t", 'size', 1 .. 
8), "\n"; +foreach my $loopSize (sort {$a <=> $b} keys %data) +{ + print join("\t", $loopSize, @{$data{$loopSize}}), "\n"; +} + +exit; + +sub writeSassFile +{ + my ($filename, $loopSize, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'EOF', $loops, $loopSize, $loopSize; +# Kernel: microbench + + + + 0-10 : result, r1, r2, r3, count, stop + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +--:-:-:-:1 MOV32I r1, 1.0; +--:-:-:-:1 MOV32I r2, 1.0; +--:-:-:-:4 MOV32I r3, 1.0; + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + foreach my $i (0 .. %d) + { + my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y'; + + $out .= "--:-:-:$y:1 FFMA result, r1, r2, r3;\n"; + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +EOF + + close $fh; +} + +__END__ + diff --git a/Assembler/MaxAs/microbench/throughput4.pl b/Assembler/MaxAs/microbench/throughput4.pl new file mode 100755 index 0000000..8f8760c --- /dev/null +++ b/Assembler/MaxAs/microbench/throughput4.pl @@ -0,0 +1,120 @@ +#!/usr/bin/perl +use strict; + +my $loopSize = 512; +my $blocks = 64; +my $loops = 102400; +my $fileName = 'throughput2.sass'; + +writeSassFile($fileName, $loops); + +#print `maxas.pl -p $fileName`; +#exit; + +print `maxas.pl -i $fileName microbench.cubin`; +exit if $?; + +foreach my $thread128 (4) +{ + my $threads = $thread128 * 128; + my $fops = 2 * $loops * $loopSize * $blocks * $threads; + + print "./microbench e $blocks $threads $fops\n\n"; + my $data = `./microbench e $blocks $threads $fops`; + exit($?) 
if $?; + + my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + + printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0; +} + +exit; + +sub writeSassFile +{ + my ($filename, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'EOF', $loops; +# Kernel: microbench + + + + 0-10 : result, r1, r2, r3 + 20-27 ~ count, stop + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +--:-:-:-:1 MOV32I r1, 1.0; +--:-:-:-:1 MOV32I r2, 1.0; +--:-:-:-:4 MOV32I r3, 1.0; + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + foreach my $i (0 .. 511) + { + my $yield = ($i + 32) & 63 ? '-' : 'Y'; + + my $stall = $i == 511 ? 0 : 1; + + #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; + + #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; + #$out .= "--:-:-:-:1 MOV result, RZ;\n"; + + $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; + #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; + #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +EOF + + close $fh; +} + +__END__ + +VMAD.U8.U8 + +dddd 2655 / 4968 = 53.4% +1d1d 4594 / 4968 = 92.4% +11d 4746 / 4968 = 95.5% +111d 4841 / 4968 = 97.4% + +block context switches are a little more expensive than thread context switches + +stall codes: + +f : 13 clocks +e : 8 clocks +d : 6 clocks +c : 8 clocks, no yield +b : 11 clocks +a : 10 clocks +9 : 9 clocks +8 : 8 clocks +7 : 7 clocks +6 : 6 clocks +5 : 5 clocks +4 : 4 clocks +3 : 3 clocks +2 : 2 clocks +1 : 1 clocks, no yield +0 : 0 clocks, no yield, dual issue \ No newline at end of file diff --git a/Assembler/MaxAs/microbench/throughput5.pl 
b/Assembler/MaxAs/microbench/throughput5.pl new file mode 100755 index 0000000..f9bda8e --- /dev/null +++ b/Assembler/MaxAs/microbench/throughput5.pl @@ -0,0 +1,164 @@ +#!/usr/bin/perl +use strict; +my %p; + +$p{N} = 8192; +$p{blocking} = 8; +$p{unroll} = 8; +$p{threads} = 64; #256 + +$p{csize} = $p{blocking} * $p{blocking}; +$p{loopSize} = $p{unroll} * $p{csize}; +$p{width} = sqrt($p{csize} * $p{threads}); +$p{blocks} = ($p{N} / $p{width}) * ($p{N} / $p{width}); +$p{loops} = $p{N} / $p{unroll}; +$p{fops} = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads}; + +my $fileName = 'throughput2.sass'; + +my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops); + +#print join("\t", @params), "\n"; +#print join("\t", @p{@params}), "\n"; + +print map sprintf("%-9s: %d\n", $_, $p{$_}), @params; + +writeSassFile($fileName, $p{loopSize}, $p{loops}); + +#print `maxas.pl -p $fileName`; +#exit; + +print `maxas.pl -i $fileName microbench.cubin`; + +exit if $?; + +my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`; + +my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + +print $data; + +#printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; + + + + +sub writeSassFile +{ + my ($filename, $loopSize, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'END_SASS', $loops; +# Kernel: microbench + + + + 1, 9, 2,10,17,25,18,26 : cy0x<0-7> + 5,13, 6,14,21,29,22,30 : cy1x<0-7> + 3,11, 0, 8,19,27,16,24 : cy2x<0-7> + 7,15, 4,12,23,31,20,28 : cy3x<0-7> + 35,43,32,40,51,59,48,56 : cy4x<0-7> + 39,47,36,44,55,63,52,60 : cy5x<0-7> + 33,41,34,42,49,57,50,58 : cy6x<0-7> + 37,45,38,46,53,61,54,62 : cy7x<0-7> + + 64-71 : j0Ax<0-3>, j0By<0-3> + 72-79 : j1Ax<0-3>, j1By<0-3> + + 0-79 : r<0-79> + + 100-101 : count, stop + + //102-112 ~ readAs, readBs, writeS + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +//--:-:-:-:1 MOV writeS, RZ; +//--:-:-:-:1 MOV readAs, RZ; +//--:-:-:-:1 MOV readBs, RZ; + + + 
return join '', map "--:-:-:-:1 MOV r$_, RZ;\n", 0..63; + + + + return join '', map "--:-:-:-:1 MOV32I r$_, 0x00010001;\n", 64..79; + + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + my @swirl1 = ([0,0],[0,4],[4,4],[4,0]); + my @swirl2 = ([0,0],[1,0],[1,1],[0,1]); + my @swirl3 = ([0,2],[2,2],[2,0],[0,0]); + + my @cOrder; + foreach my $s1 (@swirl1) + { + foreach my $s2 (@swirl2) + { + foreach my $s3 (@swirl3) + { + push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]]; + } + } + } + + foreach my $j (0..7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + + my %%insert; + + #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6; + + $insert{c62} = + "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . + "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . + "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . + "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . + "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + my $ins = $insert{"c$c"} || ''; + my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins || + my $yield = $c == 32 ? 'Y' : '-'; + my $wait = '--'; #$c ? '--' : '01'; + + my $xReg = $x >> 1; + my $yReg = $y >> 1; + my $xPart = $x & 1 ? '.H1' : ''; + my $yPart = $y & 1 ? 
'.H1' : ''; + + $out .= sprintf "$wait:-:-:$yield:$stall XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x, $odd,$xReg,$xPart, $odd,$yReg,$yPart, $y,$x, $ins; + } + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +END_SASS + + close $fh; +} + +__END__ + + my %%insert = ( + c0 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n", + c2 => "--:-:-:-:1 LDS.U.128 j${nOdd}By00, [readBs+0x10];\n", + c4 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n", + c6 => "--:-:1:-:1 LDS.U.128 j${nOdd}By64, [readBs+0x10];\n", + ); \ No newline at end of file diff --git a/Assembler/MaxAs/microbench/xmad.pl b/Assembler/MaxAs/microbench/xmad.pl new file mode 100755 index 0000000..6aadb89 --- /dev/null +++ b/Assembler/MaxAs/microbench/xmad.pl @@ -0,0 +1,12 @@ +#!/usr/bin/perl +use strict; + +print `maxas.pl -i xmad2.sass microbench.cubin`; + +exit if $?; + +print `./microbench i 1 128`; + + +__END__ + diff --git a/Assembler/MaxAs/microbench/xmad2.sass b/Assembler/MaxAs/microbench/xmad2.sass new file mode 100644 index 0000000..f0ce936 --- /dev/null +++ b/Assembler/MaxAs/microbench/xmad2.sass @@ -0,0 +1,144 @@ +# Kernel: microbench +# InsCnt: 18 +# RegCnt: 5 +# SharedSize: 4096 +# BarCnt: 1 +# Params(3): +# ord:addr:size:align +# 0:0x140:8:0 +# 1:0x148:8:0 +# 2:0x150:8:0 +# +# Instructions: + + + blockDimX : c[0x0][0x8] + blockDimY : c[0x0][0xc] + blockDimZ : c[0x0][0x10] + gridDimX : c[0x0][0x14] + gridDimY : c[0x0][0x18] + gridDimZ : c[0x0][0x1c] + + param_out[0] : c[0x0][0x140] + param_out[1] : c[0x0][0x144] + param_clocks[0] : c[0x0][0x148] + param_clocks[1] : c[0x0][0x14c] + param_in[0] : c[0x0][0x150] + param_in[1] : c[0x0][0x154] + + + + + 0-1 : out<0-1> + 2-3 : clocks<0-1> + 4-15 : result, result2, tid, bid, blockDim, clock1, clock2, scale, s + 16-24 : a, b, c, x + + + +// Load in our params +--:-:-:-:1 MOV out0, param_out[0]; +--:-:-:-:1 MOV out1, param_out[1]; +--:-:-:-:1 MOV clocks0, param_clocks[0]; +--:-:-:-:1 MOV clocks1, 
param_clocks[1]; +//--:-:-:-:1 MOV in, c[0x0][0x148]; +--:-:-:-:1 MOV blockDim, blockDimX; + +--:-:-:-:1 PSETP.AND.AND P0, PT, !PT, PT, PT; + +--:-:-:-:6 MOV32I result, 0xffffffff; +--:-:-:-:6 MOV32I result2, 0x0; +--:-:-:-:1 MOV32I a, 1; +--:-:-:-:1 MOV32I b, 1; +--:-:-:-:6 MOV32I c, 0x0; + +// (127 - scale) << 23 +//--:-:-:-:6 MOV32I scale, 28; +//--:-:-:-:6 IADD scale, -scale, 127; +//--:-:-:-:6 SHL scale, scale, 23; + + +//--:-:-:-:6 MOV32I c, 0x4f765432; + +//--:-:1:-:2 LDG.CI.128 a, [in]; + +//01:-:-:-:6 VMAD.S16.S16 result, a, b, c; + +//--:-:-:-:6 MOV result, a; + +// a >> 16 | (b & 0xffff0000) + +//--:-:-:-:6 SHR.U32 result, a, 16; +//--:-:-:-:6 LOP3.LUT result, result, b, c, 0xf8; + +//--:-:-:-:6 I2I.S32.S16 result, a.H1; + +//--:-:-:Y:d IADD result.CC, a, -c; +//--:-:-:Y:2 IADD.X result2, b, -RZ; + +//--:-:-:-:6 SHR result, a, 1; + +//--:-:-:-:6 BFI result, b, 0x1010, a; + +--:-:-:-:1 CS2R clock1, SR_CLOCKLO; + +//--:-:-:-:6 XMAD.S16.S16 c, a, b, RZ; +//--:-:-:-:6 ISET.LT.AND s, c, RZ, PT; +//--:-:-:-:6 IADD result.CC, c, result; +//--:-:-:-:6 IADD.X result2, s, result2; + +//--:-:-:-:6 XMAD.S16.S16 result.CC, a, b, result; +//--:-:-:-:6 IADD.X result2, result2, RZ; + +//--:-:-:-:6 SHF.R.S64 result, result, 1, result2; +//--:-:-:-:6 MOV32I result2, 0; + +--:-:-:-:f LOP.AND.NZ P0, RZ, result, 1; + +--:-:-:-:6 @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result; + +//--:-:1:-:d I2F.F32.S32 result2, a; +//01:-:-:-:6 FMUL result2, result2, scale; +//01:-:2:-:d F2I.S32.F32 result, result2; + +02:-:-:-:6 CS2R clock2, SR_CLOCKLO; + +//F2I = "^$pred?F2I$ftz$x2x$round $r0, $cr20;" +//I2F = "^$pred?I2F$x2x$rnd $r0, $cr20;" +//x2x = "\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)" +//rnd = "(?:\.(?RN|RM|RP|RZ))?" +//round = "(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?" +//r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" +//r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" 
+ + +//--:-:-:-:1 XMAD.MRG x, a, b.H1, RZ; +//--:-:-:-:6 XMAD result, a.H1, b.H1, c; +//--:-:-:-:1 XMAD.PSL.CBCC result, a.H1, x.H1, result; + +// Get the first clock value + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:2 S2R bid, SR_CTAID.X; + + + +// Take the difference of clocks +--:-:-:-:1 IADD clock1, clock2, -clock1; + +// Setup our output addresses +// Stall your pipeline dependencies properly +03:-:-:-:1 XMAD tid, blockDim, bid, tid; +--:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; +--:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; +--:-:-:Y:6 SHL tid, tid, 0x2; + +--:-:-:-:1 IADD clocks, clocks, tid; +--:-:-:-:1 IADD out, out, tid; + +// Output the results. +// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values +--:-:-:-:1 STG.E [clocks], result2; +--:-:-:-:1 STG.E [out], result; +--:-:-:-:5 EXIT; + diff --git a/Assembler/MaxAs/pm_to_blib b/Assembler/MaxAs/pm_to_blib new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/MaxAs/sgemm/batched_gemm.xlsx b/Assembler/MaxAs/sgemm/batched_gemm.xlsx new file mode 100644 index 0000000..c88f0a7 Binary files /dev/null and b/Assembler/MaxAs/sgemm/batched_gemm.xlsx differ diff --git a/Assembler/MaxAs/sgemm/cublas_sgemm.ptx b/Assembler/MaxAs/sgemm/cublas_sgemm.ptx new file mode 100644 index 0000000..8edec86 --- /dev/null +++ b/Assembler/MaxAs/sgemm/cublas_sgemm.ptx @@ -0,0 +1,65 @@ +.version 4.1 +.target sm_50 +.address_size 64 + +// ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx + +// You can use maxas to insert cublas_device.lib code into a cubin built from this ptx: + +// From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib + +// cuobjdump -lelf cublas_device.lib | find "sm_50" + +// cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib + +// maxas -l maxwell_sgemm.asm.sm_50.cubin + +// maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass +// maxas -e 
-k maxwell_sgemm_128x64_nt maxwell_sgemm_128x64_nt.sass + +// maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin +// maxas -i maxwell_sgemm_128x64_nt.sass cublas_sgemm.cubin + +// The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas. + +.visible .entry maxwell_sgemm_128x128_nt( + .param .u64 .ptr.global.align 8 param_A, + .param .u64 .ptr.global.align 8 param_B, + .param .u64 .ptr.global.align 8 param_C, + .param .s32 param_lda, + .param .s32 param_ldb, + .param .s32 param_ldc, + .param .s32 param_k, + .param .u64 .ptr.global.align 8 param_Alpha, + .param .u64 .ptr.global.align 8 param_Beta, + .param .s32 param_alpha, + .param .s32 param_beta, + .param .s32 param_flag +) +.reqntid 256 +{ + .shared .align 16 .b8 share[16384]; + + ret; +} + +.visible .entry maxwell_sgemm_128x64_nt( + .param .u64 .ptr.global.align 8 param_A, + .param .u64 .ptr.global.align 8 param_B, + .param .u64 .ptr.global.align 8 param_C, + .param .s32 param_lda, + .param .s32 param_ldb, + .param .s32 param_ldc, + .param .s32 param_k, + .param .u64 .ptr.global.align 8 param_Alpha, + .param .u64 .ptr.global.align 8 param_Beta, + .param .s32 param_alpha, + .param .s32 param_beta, + .param .s32 param_flag +) +.reqntid 128 +{ + .shared .align 16 .b8 share[12288]; + + ret; +} diff --git a/Assembler/MaxAs/sgemm/new.cubin b/Assembler/MaxAs/sgemm/new.cubin new file mode 100644 index 0000000..6a1572b Binary files /dev/null and b/Assembler/MaxAs/sgemm/new.cubin differ diff --git a/Assembler/MaxAs/sgemm/sgemm.cpp b/Assembler/MaxAs/sgemm/sgemm.cpp new file mode 100644 index 0000000..f2127d8 --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm.cpp @@ -0,0 +1,480 @@ +// sgemm.cpp : Defines the entry point for the console application. 
+// + +#include +#include +#include +#include +#include +#include + +CUcontext hContext = 0; +cublasHandle_t hCublas = 0; + +float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat = 1, int printVars = 0); +float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat); +void gflops(const char* ident, int N, float ms, int repeat); +void test(float* C, float* T, int N, size_t size); + +#define REPEAT_BLOCK 2000 + +#define CUDA_CHECK( fn ) do { \ + CUresult status = (fn); \ + if ( CUDA_SUCCESS != status ) { \ + const char* errstr; \ + cuGetErrorString(status, &errstr); \ + printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \ + if (hCublas) cublasDestroy(hCublas); \ + if (hContext) cuCtxDestroy(hContext); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUBLAS_CHECK( fn ) do { \ + cublasStatus_t status = (fn); \ + if ( CUBLAS_STATUS_SUCCESS != status ) { \ + printf("Cublas Failure (line %d of file %s):\n\t%s returned %d\n", __LINE__, __FILE__, #fn, status); \ + if (hCublas) cublasDestroy(hCublas); \ + if (hContext) cuCtxDestroy(hContext); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +int main(int argc, char* argv[]) +{ + char deviceName[32]; + int count, ordinal, major, minor; + CUdevice hDevice; + CUevent hStart, hStop; + CUdeviceptr devA, devB, devC, devT, otherDevA, otherDevB; + + // Initialize the Driver API and find a device + CUDA_CHECK( cuInit(0) ); + CUDA_CHECK( cuDeviceGetCount(&count) ); + for (ordinal = 0; ordinal < count; ordinal++) + { + CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); + CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); + CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, 
hDevice) ); + CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); + if (major > 5 || (major == 5 && minor >= 2)) // compare (major, minor) as a pair: accept compute 5.2 or newer + { + //printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); + break; + } + } + if (ordinal == count) // loop ran off the end: no device satisfied the capability check + { + printf("No compute 5.2 or newer device found, exiting.\n"); + exit(EXIT_FAILURE); + } + + // First command line arg is the size of N divided by 128 (out-of-range values fall back to 64) + int thread128 = 64; + if (argc > 1) + thread128 = atoi(argv[1]); + if (thread128 > 64 || thread128 < 1) + thread128 = 64; + + // Second command line arg is the repeat count for benchmarking (out-of-range values fall back to 1) + int repeat = 1; + if (argc > 2) + repeat = atoi(argv[2]); + if (repeat > 10000 || repeat < 1) + repeat = 1; + + // Third command line arg is the normalized float size (unrecognized formats fall back to float) + CUarray_format format = CU_AD_FORMAT_FLOAT; + if (argc > 3) + format = (CUarray_format)atoi(argv[3]); + if (format != CU_AD_FORMAT_FLOAT && format != CU_AD_FORMAT_UNSIGNED_INT16 && format != CU_AD_FORMAT_UNSIGNED_INT8) + format = CU_AD_FORMAT_FLOAT; + + // Fourth command line arg is for printf debugging (out-of-range values disable it) + int printVars = 0; + if (argc > 4) + printVars = atoi(argv[4]); + if (printVars > 100 || printVars < 1) + printVars = 0; + + int N = thread128 * 128; + float alpha = 1, beta = 0, ms = 1; + size_t sizeOther = N * N; + size_t sizeFloat = sizeOther * 4; + + float* A = (float*)malloc(sizeFloat); + float* B = (float*)malloc(sizeFloat); + float* C = (float*)malloc(sizeFloat); + float* T = (float*)malloc(sizeFloat); + float *otherA, *otherB; + + //int counter = 0; + //srand((unsigned int)time(0)); + for(int i = 0; i < N * N; i++) // + { + //A[i] = (float)rand() / (float)RAND_MAX; + //B[i] = (float)rand() / (float)RAND_MAX; + A[i] = B[i] = 1.0f; // * (i & 3) + 1.0f; + //A[i] = 1.0f; + //B[i * N + counter++] = 1.0f; // identity matrix + } + + if (format == CU_AD_FORMAT_FLOAT) + { + sizeOther *= 4; + otherA = A; + otherB = B; + } + else if (format == CU_AD_FORMAT_UNSIGNED_INT16) + { + sizeOther *= 2; + unsigned short* othera = (unsigned 
short*)malloc(sizeOther); + unsigned short* otherb = (unsigned short*)malloc(sizeOther); + for(int i = 0; i < N * N; i++) + othera[i] = otherb[i] = 65535; + + otherA = reinterpret_cast(othera); + otherB = reinterpret_cast(otherb); + } + else // (format == CU_AD_FORMAT_UNSIGNED_INT8) + { + otherA = (float*)malloc(sizeOther); + otherB = (float*)malloc(sizeOther); + memset(otherA, 255, sizeOther); + memset(otherB, 255, sizeOther); + } + + CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); + //CUBLAS_CHECK( cublasCreate(&hCublas) ); + + CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT + CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); + + CUDA_CHECK( cuMemAlloc(&devA, sizeFloat) ); + CUDA_CHECK( cuMemAlloc(&devB, sizeFloat) ); + CUDA_CHECK( cuMemAlloc(&devC, sizeFloat) ); + CUDA_CHECK( cuMemAlloc(&devT, sizeFloat) ); + + CUDA_CHECK( cuMemcpyHtoD(devA, A, sizeFloat) ); + CUDA_CHECK( cuMemcpyHtoD(devB, B, sizeFloat) ); + CUDA_CHECK( cuMemsetD8(devC, 0, sizeFloat) ); + CUDA_CHECK( cuMemsetD8(devT, 0, sizeFloat) ); + + if (format == CU_AD_FORMAT_FLOAT) + { + otherDevA = devA; + otherDevB = devB; + } + else + { + CUDA_CHECK( cuMemAlloc(&otherDevA, sizeOther) ); + CUDA_CHECK( cuMemAlloc(&otherDevB, sizeOther) ); + CUDA_CHECK( cuMemcpyHtoD(otherDevA, otherA, sizeOther) ); + CUDA_CHECK( cuMemcpyHtoD(otherDevB, otherB, sizeOther) ); + } + + // Warm up the clock (unless under nsight) + //if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER + // for (int i = 0; i < 3; i++) + // CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast(devA), N, reinterpret_cast(devB), N, &beta, reinterpret_cast(devT), N) ); + + // Launch our kernel + ms = assemblySgemm("sgemm_kernel_64", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars); + gflops("Max64 ", N, ms, repeat); + + ms = assemblySgemm("sgemm_kernel_128", format, sizeOther, devC, otherDevA, otherDevB, 
N, hStart, hStop, repeat, printVars); + gflops("Max128", N, ms, repeat); + + //ms = cublasSgemm("maxwell_sgemm_128x64_nt", devT, devA, devB, N, hStart, hStop, repeat); + //gflops("Cub64 ", N, ms, repeat); + + //ms = cublasSgemm("maxwell_sgemm_128x128_nt", devT, devA, devB, N, hStart, hStop, repeat); + //gflops("Cub128", N, ms, repeat); + + // Run cublas again for the same repeat count for comparison + //CUDA_CHECK( cuEventRecord(hStart, NULL) ); + //for (int i = 0; i < repeat; i++) + // CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast(devA), N, reinterpret_cast(devB), N, &beta, reinterpret_cast(devT), N) ); + //CUDA_CHECK( cuEventRecord(hStop, NULL) ); + //CUDA_CHECK( cuEventSynchronize(hStop) ); + //CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); + //gflops("Cublas", N, ms, repeat); + + // Get back our results from each kernel + CUDA_CHECK( cuMemcpyDtoH(C, devC, sizeFloat) ); + CUDA_CHECK( cuMemcpyDtoH(T, devT, sizeFloat) ); + + // Cleanup and shutdown of cuda + CUDA_CHECK( cuMemFree(devA) ); + CUDA_CHECK( cuMemFree(devB) ); + CUDA_CHECK( cuMemFree(devC) ); + CUDA_CHECK( cuMemFree(devT) ); + if (format != CU_AD_FORMAT_FLOAT) + { + CUDA_CHECK( cuMemFree(otherDevA) ); + CUDA_CHECK( cuMemFree(otherDevB) ); + } + + CUDA_CHECK( cuEventDestroy(hStart) ); + CUDA_CHECK( cuEventDestroy(hStop) ); + + //CUBLAS_CHECK( cublasDestroy(hCublas) ); + //hCublas = 0; + CUDA_CHECK( cuCtxDestroy(hContext) ); + hContext = 0; + + // compare C and T for accuracy + test(C, T, N, sizeFloat); + + // And free up host memory + free(A); free(B); free(C); free(T); + + if (format != CU_AD_FORMAT_FLOAT) + { + free(otherA); + free(otherB); + } + + return 0; +} + +// Our kernel wrapper function +float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat, int printVars) +{ + // Configure our x and y grid dimensions (assume nice 
square matrixes). + // Each block gets 128 tracks from A and 128 tracks from B. + // Each of the 256 threads calculates 64 elements of that 128x128 sub matrix of C. + // See Figure 2 here to get the gist of things (we use a different mapping to maximize LDS.128 usage): + // http://icl.cs.utk.edu/projectsfiles/magma/pubs/fermi_gemm.pdf + + int threads, width; + if (strcmp(kernel, "sgemm_kernel_64") == 0) + { + threads = 64; + width = 64; + } + else + { + threads = 256; + width = 128; + } + + int gridDimXY = N / width + (N % width != 0); + int blocks = gridDimXY * gridDimXY; + + // Setup out debug printf output buffer + CUdeviceptr devD = NULL; + int* D = NULL; + int sizeD = 0; + + if (printVars) + { + sizeD = blocks * threads * printVars * sizeof(int); + D = (int*)malloc(sizeD); + + CUDA_CHECK( cuMemAlloc(&devD, sizeD) ); + CUDA_CHECK( cuMemsetD8(devD, 0, sizeD) ); + } + + // Load the cubin + CUmodule hModule; + CUDA_CHECK( cuModuleLoad(&hModule, "sgemm.cubin") ); + + // Load the textures + CUtexref texA, texB; + CUDA_CHECK( cuModuleGetTexRef(&texA, hModule, "texA") ); + CUDA_CHECK( cuModuleGetTexRef(&texB, hModule, "texB") ); + + // Configure the textures + CUDA_CHECK( cuTexRefSetFormat(texA, format, 4) ); + CUDA_CHECK( cuTexRefSetFormat(texB, format, 4) ); + + CUDA_CHECK( cuTexRefSetAddress(NULL, texA, devA, size) ); + CUDA_CHECK( cuTexRefSetAddress(NULL, texB, devB, size) ); + + // Load the kernel function + CUfunction hKernel; + CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) ); + + // Setup the params + float alpha = 1.0f; + void* params[] = { &devC, &N, &N, &N, &N, &N, &N, &alpha, &devD }; + + float totalTime = 0; + // Launch the kernel repeat times.. but break it up into pieces so as not to lock things up. + while (repeat > 0) + { + float ms; + int r = repeat > REPEAT_BLOCK ? 
REPEAT_BLOCK : repeat; + CUDA_CHECK( cuEventRecord( hStart, NULL ) ); + + for (int i = 0; i < r; i++) + CUDA_CHECK( cuLaunchKernel(hKernel, gridDimXY, gridDimXY, 1, threads, 1, 1, 0, 0, params, 0) ); + + CUDA_CHECK( cuEventRecord( hStop, NULL ) ); + CUDA_CHECK( cuEventSynchronize( hStop ) ); + CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) ); + totalTime += ms; + repeat -= r; + } + + + CUDA_CHECK( cuModuleUnload(hModule) ); + + // And here we print out the debug info if requested: + if (printVars) + { + CUDA_CHECK( cuMemcpyDtoH(D, devD, sizeD) ); + CUDA_CHECK( cuMemFree(devD) ); + int *iD = D; + float *fD = reinterpret_cast(D); + unsigned int *uD = reinterpret_cast(D); + + for (int by = 0; by < gridDimXY; by++) + { + for (int bx = 0; bx < gridDimXY; bx++) + { + unsigned int clock = 0xffffffff, sm = 0; + + for (int tid = 0; tid < threads; tid++) + { + //printf("by: %3d, bx: %3d, tid:%3d, rA:%5d, rB:%5d, wr:%5d, rd:%5d, cx:%5d, cy:%5d, ci:%5d, c:%.2f\n", + //printf("by: %3d, bx: %3d, tid:%3d, t0:%5d, end:%5d, k:%5d, tid2:%5d, tid15:%5d, ldx:%5d, t2:%5d, t4:%5d\n", + // by, bx, tid, iD[0], iD[1], iD[2], iD[3], iD[4], iD[5], iD[6], iD[7] + //); + if (uD[1] < clock) clock = uD[1]; + sm = uD[0]; + + iD += printVars; + fD += printVars; + uD += printVars; + } + printf("%02d %08u %d %d\n", sm, clock, by, bx); + } + } + free(D); + } + + return totalTime; +} + +typedef struct dPointer +{ + CUdeviceptr lo; + CUdeviceptr hi; +} dPointer; + +float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat) +{ + int threads, gridX, gridY; + if (strcmp(kernel, "maxwell_sgemm_128x64_nt") == 0) + { + threads = 128; + gridX = N / 128 + (N % 128 != 0); + gridY = N / 64 + (N % 64 != 0); + } + else + { + threads = 256; + gridX = gridY = N / 128 + (N % 128 != 0); + } + int blocks = gridX * gridY; + + // Load the cubin + // See cublas_sgemm.ptx for info on how to build this. 
+ CUmodule hModule; + CUDA_CHECK( cuModuleLoad(&hModule, "cublas_sgemm.cubin") ); + + // Load the kernel function + CUfunction hKernel; + CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) ); + + // Setup the params + // I should probably be working in 64 bits... + dPointer dA = { devA, 0 }; + dPointer dB = { devB, 0 }; + dPointer dC = { devC, 0 }; + + int flag = 0; + float alpha = 1.0; + float beta = 0.0; + + void* params[] = { &dA, &dB, &dC, &N, &N, &N, &N, &dA, &dA, &alpha, &beta, &flag }; + + float totalTime = 0; + // Launch the kernel repeat times.. but break it up into pieces so as not to lock things up. + while (repeat > 0) + { + float ms; + int r = repeat > REPEAT_BLOCK ? REPEAT_BLOCK : repeat; + CUDA_CHECK( cuEventRecord( hStart, NULL ) ); + + for (int i = 0; i < r; i++) + CUDA_CHECK( cuLaunchKernel(hKernel, gridX, gridY, 1, threads, 1, 1, 0, 0, params, 0) ); + + CUDA_CHECK( cuEventRecord( hStop, NULL ) ); + CUDA_CHECK( cuEventSynchronize( hStop ) ); + CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) ); + totalTime += ms; + repeat -= r; + } + + + CUDA_CHECK( cuModuleUnload(hModule) ); + + return totalTime; +} + +void gflops(const char* ident, int N, float ms, int repeat) +{ + // Standard sgemm flops formula + ms /= repeat; + printf("%s GFLOPS: %.2f (size: %d, iterations: %d)\n", ident, ((double)N * N * N * 2.0 + N * N) / (ms * 1000000.0), N, repeat); +} + +void test(float* C, float* T, int N, size_t size) +{ + // Compare our implementation with the cublas result + int errors = memcmp(C, T, size); + if (errors) + { + if (N <= 512) // This gets too big and slow for large N + { + errors = 0; + FILE* file; + if (fopen_s(&file, "data.txt", "w") == 0) + { + for (int y = 0; y < N; ++y) + { + for (int x = 0; x < N; ++x) + { + float c = C[x*N + y]; + float t = T[x*N + y]; + if (c != t) + { + errors++; + fprintf(file, "%.8f!%.8f\t", c , t); + //fprintf(file, "%.0f!", c); + //fprintf(file, "!"); + } + else + { + //fprintf(file, "%.0f=%.0f\t", c , t); + 
//fprintf(file, "%.0f=", c); + fprintf(file, "="); + } + } + fprintf(file, "\n"); + } + fclose(file); + printf("%d errors\n", errors); + } + else + { printf("Cannot open data.txt for writing\n"); } + } + else + { printf("%d errors\n", errors); } + } + else + { printf("%d errors\n", errors); } +} \ No newline at end of file diff --git a/Assembler/MaxAs/sgemm/sgemm.cu b/Assembler/MaxAs/sgemm/sgemm.cu new file mode 100644 index 0000000..ce8b2a6 --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm.cu @@ -0,0 +1,105 @@ + +// Note this file isn't configured to automatically compile. +// Here's how: + +// If you want to look at the ptx first: +// nvcc -arch sm_50 -m 32 -ptx sgemm.cu + +// Manually compile your kernel to a cubin. +// You should only have to do this once, unless you change params or shared size or globals: +// nvcc -arch sm_50 -m 32 -cubin sgemm.cu + +// If tweaking a kernel or writing a new one based on this shell code you would then do this: +// maxas.pl -e kernel.cubin kernel.sass + +// I've already included a modified kernel (sgemm.sass) so the next step is.. 
+ +// Splice the manually assembled code back into the cubin: +// maxas.pl -i sgemm.sass sgemm.cubin + +#include +#include +#include +#include + +typedef texture floatTex; + +floatTex texA(0, cudaFilterModePoint, cudaAddressModeBorder); +floatTex texB(0, cudaFilterModePoint, cudaAddressModeBorder); + +// Use extern C so C++ doesn't mangle our kernel name +extern "C" +// This kernel requires 256x1x1 threads per block +__global__ void __launch_bounds__(256) sgemm_kernel_128( + float *C, + const int m, const int n, const int k, + const int lda, const int ldb, const int ldc, + float alpha, int *D) +{ + // Declare any shared memory your kernel requires + // Or you could just pass the amount in as a param to cuLaunchKernel + __shared__ float4 share[1024]; + + int tid = threadIdx.x; + + // If you use indirect texture references, they will be passed as params at the end of the param list + // So set that up here to make sure they're available in your kernel + floatTex tex = tid > 127 ? texB : texA; + + // Make use of shared and your textures so it doesn't get optimized away + share[tid] = tex1Dfetch(tex, tid); + + __syncthreads(); + + // output something so your setup isn't optimized away. + C[tid] = share[255-tid].x; +} + +extern "C" +__global__ void __launch_bounds__(64) sgemm_kernel_64( + float *C, + const int m, const int n, const int k, + const int lda, const int ldb, const int ldc, + float alpha, int *D) +{ + __shared__ float4 share[512]; + + int tid = threadIdx.x; + + floatTex tex = tid > 127 ? texB : texA; + + share[tid] = tex1Dfetch(tex, tid); + + __syncthreads(); + + C[tid] = share[255-tid].x; +} + +// A note about using the Cuda Runtime. +// If that's your preference over the driver API then here's what you'd do: + +// In your project properties in the Cuda C/C++ panel: +// -Set the "Keep Processed Files" (-keep) option +// -Add a -v manually to the command line +// If compiling on command line just add -keep -v options to nvcc. 
+// Rebuild your solution and look in the log for these lines that follow the ptxas step: + +// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda +// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" +// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" + +// You just need to manually run these 3 commands (or add them to a build script) +// after you've modified the cubin generated from the preceding ptxas command. +// That will give you a new .cu.obj file which will automatically be linked in for you next time you +// build your project (or you could manually run the linker step as well). + +// Having done that you can call your kernel normally using the <<< >>> syntax. +// Debugging will have to be with the sass syntax but that's what you'll want to see anyway. +// With fatbin you can also keep non-maxwell optimized versions of your code. + + +// I just discovered this also works as a shortcut to the above: +// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu + +// The cu kernel definitions above need to have empty bodies. +// And, the cu file must be compiled to a lib separately before linking. 
\ No newline at end of file diff --git a/Assembler/MaxAs/sgemm/sgemm.pl b/Assembler/MaxAs/sgemm/sgemm.pl new file mode 100644 index 0000000..9b1661b --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm.pl @@ -0,0 +1,102 @@ +#!/usr/bin/perl +use strict; +use warnings; +my $CU_AD_FORMAT_UNSIGNED_INT8 = 0x01; +my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02; +my $CU_AD_FORMAT_FLOAT = 0x20; +# Re-run maxas.pl when a preprocessed file is missing or older than its source; exit non-zero as soon as a step fails. +if (!-f 'sgemm_pre_128.sass' || (stat 'sgemm128.sass')[9] > (stat 'sgemm_pre_128.sass')[9]) +{ + print `maxas.pl -p sgemm128.sass sgemm_pre_128.sass`; + exit 1 if $?; + print `maxas.pl -i sgemm128.sass sgemm.cubin`; + exit 1 if $?; + print `maxas.pl -e -k sgemm_kernel_128 sgemm.cubin sgemm_final_128.sass`; +} +if (!-f 'sgemm_pre_64.sass' || (stat 'sgemm64.sass')[9] > (stat 'sgemm_pre_64.sass')[9]) +{ + print `maxas.pl -p sgemm64.sass sgemm_pre_64.sass`; + exit 1 if $?; + print `maxas.pl -i sgemm64.sass sgemm.cubin`; + exit 1 if $?; + print `maxas.pl -e -k sgemm_kernel_64 sgemm.cubin sgemm_final_64.sass`; +} + +#print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2); +# First run: output is discarded. +`Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`; + +print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`; +exit; # NOTE: everything below is currently unreachable; remove this exit to run the size sweep +my %data; # matrix size N => list of GFLOPS figures in the order the benchmark prints them +foreach my $thread128 (4 .. 
64) +{ + my $N = $thread128 * 128; + + my $iterations = int(20 * (64 * 128)**3 / $N**3); + $iterations = 10000 if $iterations > 10000; + + print "$N $iterations\n"; + + my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`; + + foreach my $bench (split "\n", $data) + { + if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /) + { + push @{$data{$N}}, $2; + print "$1 $2\n"; + } + } +} +print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n"; +# Each row starts with the matrix size so the values line up under the "size" header column. +foreach my $N (sort { $a <=> $b } keys %data) +{ + print join("\t", $N, @{$data{$N}}), "\n"; +} + + +#print $data; + +__END__ + + +64 * 128 * 16 * 1.620 * .931 / 520 + +Max64 GFLOPS: 1377.38 (size: 256, iterations: 2000) +Max128 GFLOPS: 973.70 (size: 256, iterations: 2000) +Cub64 GFLOPS: 1272.42 (size: 256, iterations: 2000) +Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000) + +my @data = grep /\S/, split "\n", $data; + +my $min; +my %smData; +my @sdata; +foreach (@data) +{ + next if /GFLOPS/; + + my ($sm, $clock, $by, $bx) = split /\s+/; + + $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm}; + + $min = $clock if !$min || $clock < $min; + + push @sdata, [$sm, $clock, $by, $bx]; +} + +foreach (@sdata) +{ + $_->[1] -= $smData{$_->[0]}; +} + +foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata) +{ + printf "%02d %8u by: %2d bx: %2d\n", @$_; + +} + + diff --git a/Assembler/MaxAs/sgemm/sgemm.sln b/Assembler/MaxAs/sgemm/sgemm.sln new file mode 100644 index 0000000..bcbee09 --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + 
{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32 + {D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32 + {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32 + {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Assembler/MaxAs/sgemm/sgemm.vcxproj b/Assembler/MaxAs/sgemm/sgemm.vcxproj new file mode 100644 index 0000000..6d28ced --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm.vcxproj @@ -0,0 +1,92 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + {D571379D-3653-43CB-BE83-A6C68D392A05} + Win32Proj + sgemm + + + + Application + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + true + + + false + + + + + + Level3 + Disabled + _CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) + + + Console + true + $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) + cuda.lib;cublas.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + _CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) + cuda.lib;cublas.lib;%(AdditionalDependencies) + + + + + + + + + + + + + \ No newline at end of file diff --git a/Assembler/MaxAs/sgemm/sgemm128.sass b/Assembler/MaxAs/sgemm/sgemm128.sass new file mode 100644 index 0000000..038d2f3 --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm128.sass @@ -0,0 +1,613 @@ +# Kernel: sgemm_kernel_128 +# +# SharedSize: 16384 +# Params(8): +# 0:0x140:4:4 param_C, +# 1:0x144:4:0 param_m, +# 2:0x148:4:0 param_n, +# 3:0x14c:4:0 param_k, +# 4:0x150:4:0 param_lda, +# 5:0x154:4:0 param_ldb, +# 6:0x158:4:0 param_ldc +# 
7:0x15c:4:0 param_alpha +# 8:0x160:4:4 param_D // for diagnostic printf output +# +# Globals: +# c[0x0][0x164]: texA (the value is 1) +# c[0x0][0x168]: texB (the value is 0) + + + + // Temporary registers to calculate the state registers. Reuse the C output registers. + // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts. + 0-63 ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy + + // Aliases for the C registers we use for initializing C (used as vectors) + 0-63 : cz<00-63> + + // The offset we store our zero value for initializing C. Reuse a register from the second blocking registers + 80 : zOffset + + // 64 C matrix output registers. + // Use special mapping to avoid register bank conflicts between these registers and the blocking registers. + 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> + 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> + 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> + 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> + 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> + + // Double buffered register blocking used in vector loads. + // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags + 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> + 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> + + // Registers to load A or B + 96-103 : loadX<0-7> + + // Key global state registers for main loop and some we reuse for outputting C. + // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of + // delayed bank conflicts between memory operations and ffmas. + // The array index bracket notation can be used to request a bank in a dynamically allocated range. 
+ 104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs + + // Registers to store the results back to global memory. Reuse any register not needed after the main loop. + // Statically allocate cs0-7 because they're vector registers. + 64-71 : cs<0-7> + + // dynamically allocated C output registers(~) + 72-103 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX + + + +// Note the absence of the loading of the stack pointer into R1. +// No idea why ptxas does that anyway when it's not used for register spilling. +// Such a waste of a perfectly good register. + +// Scheduler doesn't handle the dependency flags yet, +// so move these first instructions outside the block that's auto scheduled +//--:-:-:-:1 CS2R clock, SR_CLOCKLO; +//--:-:-:-:1 S2R smId, SR_VIRTID; +//--:-:-:-:1 S2R nSMs, SR_VIRTCFG; +--:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 +--:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 +--:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 + +// Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies +// Memory dependencies are left up to the author to deal with manually for now. + + +// First 128 threads load A to shared, 2nd 128 loads B to shared +// Note this technique is not possible in cuda or ptx as there's no way to +// efficiently specify a warp-uniform predicate for a memory op. +// Compile sgemm.cu and inspect the sass to see what I'm talking about. + +// blk = tid >= 128 ? by : bx; +// ldx = tid >= 128 ? ldb : lda; +// tex = tid >= 128 ? 
texB : texA; +01:-:-:Y:1 ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1 +06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 +--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150]; +--:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; +--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA +--:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB + +// Initialize the portion of shared we use to zero our C registers +// Give each warp its own address to write to. +// All threads write to the same address, but we don't care because only one needs to take. +// There is no bank conflict on writing to the same address, just indeterminacy in which thread will get its value stored. +--:-:-:-:1 LOP.AND zOffset, tid, -32; +--:-:-:-:1 STS.128 [zOffset + 4x<16*128>], RZ; + +// tid4 = (tid >> 5) & 3 +// tid31 = tid & 31 +// tid96 = tid & 96 +// tid128 = tid & 128 +--:-:-:-:1 BFE.U32 tid4, tid, 0x205; // 2 bits at position 5 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 LOP.AND tid128, tid, 128; + +// ldx4 = ldx * 4; +// ldx8 = ldx * 8; +--:-:-:-:1 SHR.U32 ldx, ldx4, 2; +--:-:-:-:1 IADD ldx8, ldx4, ldx4; + +// track0 = blk*128/4 + tid31 + (ldx * tid4) +--:-:-:-:1 ISCADD track0, blk, tid31, 5; +--:-:-:-:1 XMAD.LO track0, ldx, tid4, track0, xmad_t0; // XMAD.LO is a macro that is expanded out into the 3 XMADs +--:-:-:-:1 IADD track4, track0, ldx4; + +// writeS = tid31*4*4 + tid4*128*4 +// writeS += 4096 if tid >= 128 +--:-:-:-:1 SHL tid31_4, tid31, 4; +--:-:-:-:1 ISCADD writeS, tid4, tid31_4, 9; +--:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*128>; + +// int end = track0 + (k-8)*ldx; +--:-:-:-:1 MOV k, c[0x0][0x14c]; +--:-:-:-:1 IADD k, k, -8; +--:-:-:-:1 XMAD.LO end, k, ldx, track0, xmad_end; + +// readAs and readBs are carefully constructed to avoid any bank conflicts while loading from shared +// readAs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readAs, tid128, 4; +--:-:-:-:1 LOP.OR readAs, readAs, tid7; 
+--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readBs, tid, 0x70; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<8*128>, 4; + +// Preload the first 8 lines from texture memory +// Keep these instructions in this order (but allow others to interleave). +// Normally the scheduler tries to preserve source order by default, but this demonstrates how you enforce +// an ordering if you need to. +// Note: these are the 4 element vector load versions (last param: 0xf=vec4, 0x3=vec2, 0x1=single) + +--:-:1:-:1 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 +--:-:2:-:1 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2 + + + + +// Initialize C registeres to zero +// Using LDS.U.128 is a neat trick to save a few clock cyles +// (when you have enough warps to hide the latency.) + + return join '', map sprintf("--:-:3:-:1 LDS.U.128 cz%02d, [zOffset + 4x<16*128>];\n", $_ * 4), 0..15; + + +// These instuctions need to occur after the textures load so put them in a new block +// that starts with a dependency barrier wait. + + +01:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1 +02:-:-:-:1 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2 + +// Increment tracks after the loads are complete to avoid needing write-after-read dependencies +--:-:-:-:1 IADD track0, track0, ldx8; +--:-:-:-:1 IADD track4, track4, ldx8; + +// Wait for all threads to finish loading shared +04:-:-:-:5 BAR.SYNC 0; + + + +// The next store to shared goes to high area. +// Having 2 share buffers allows us to eliminate a bar.sync in the main loop. +// This way we don't have to wait for all threads to arrive before writing fresh data to shared. +// Other threads can continue reading from the last batch while the new data is being written. 
+--:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*128>; + +// Preload the fist lines of A and B from shared +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1 + + +// The main loop +// While calculating the first line, load in the next line from shared. +// Shared memory stores enough to do this 8 times per loop. +// Also pull in the next block of memory from global and store it to shared. + +// Efficiency: +// ffma: 512 +// lds: 32 dual issued +// sts: 2 dual issued +// tex: 2 dual issued +// add: 2 +// xor: 3 +// setp: 1 +// bar: 1 dual issued +// bra: 1 dual issued +// Total: 524 (512/518 = 98.8% FFMA) + +// Memory Throughput Upper Bound: +// 2 * 4 * 4 bytes per thread per 518 clocks +// 128 threads per SM +// 16 SM's (GM204) +// 1640Mhz (boost overclock) +// .931 GiB/GB (1000^3 / 1024^3) +// 193 GiB/sec +// Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz) + +LOOP: + +// Loop end condition +--:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; + + + + # We eliminated bank conflicts with our C registers and the blocking registers, + # but there are still 16 bank conflicts between the blocking registers themselves. + # By ordering the FFMA's in a swirling zigzag pattern we can completely hide those conflicts + # behind register reuse. This pattern also maximizes that reuse (47%) and minimizes the bandwidth + # out of the register bank, thereby reducing power consumption and allowing the chip to + # stay at a higher sustained clock speed. One other constraint is that we want each successive + # instruction to pull its third operand from alternating banks. We space the swirl by 2 in the x + # direction to achieve this. This has the effect of making it easier to avoid delayed bank conflicts + # with the memory operations. 
Finally, for the very first ffma, don't choose one of the 16 bank conflicts + # as we have no way of hiding that conflict behind a reuse (cublas makes this mistake). + + # Alternating banks (1320 MHz, full speed) + my @swirl = ([2,0],[2,1],[0,1],[0,0]); + my @xVals = (0,1,64,65); + + # Repeating banks (1320MHz, 83 Gflops slower, but lower power draw probably because of increased stalls) + # Only explanation I can think of is increased delayed register bank conflicts with memory ops. + #my @swirl = ([0,1],[0,0],[1,0],[1,1]); + #my @xVals = (0,2,64,66); + + my @cOrder; + foreach my $y (0,2,64,66) + { + # apply the swirl + foreach my $x (@xVals) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + # apply the zigzag + @xVals = reverse @xVals; + } + + # This ordering (a simple zigzag) eliminates the bank conflicts but only achieves 39% reuse. + # It runs 20 GFlops slower since the register bank draws more power and the clock slows down to 1306 MHz. + # There may be more delayed bank conflicts with memory operations as the slowdown is 4 GFlops more than + # the reduced clock accounts for. + #my @cOrder2; + #my @xVals = (0..3,64..67); + #foreach my $y (0..3,64..67) + #{ + # @xVals = reverse @xVals; + # push @cOrder2, [$_, $y] foreach @xVals; + #} + #@cOrder = @cOrder2; + + my %insert = + ( + # Don't start the first TLD before 12 to let ISETP write P0 + # These global reads and shared writes we put exactly in the middle of the LDS ops + # This is to not overwhelm the memory units with instructions (and because these were tested faster here). + # The 4 spacing seems to work best for vec4 instructions. + # It's odd that these two textures loads can drive 512 FFMA's all by themselves.. but 256 threads can load 8 128 F32 wide lines. + # So we only need 2 to get 8 lines from both matrices.
+ + j0c31 => "--:-:2:-:1 \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2\n", + j0c33 => "--:-:3:-:1 \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3\n", + + j6c30 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2\n", + j6c34 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3\n", + + # We need one barrier in the main loop after writing shared memory. + # The barrier is needed even if this is our last loop because we need to protect the warp shuffle step. + # Note, BAR.SYNCs do not sync memory read access automatically, you still need to flag the barriers (writes are sync'd). + # After the BAR, swap our share buffer location. We don't need an additional barrier because of these swaps. + # Note, this doubles our shared memory usage but this kernel's occupancy is entirely bound by registers. + # LOP.XOR readAs needs to be 4 clocks prior to the LDS.U.128 for readAs (but push this as far down as possible) + j6c62 => + "01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<16*128>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<16*128>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<16*128>;\n", + + # Note having 2 IADDs slightly hits our FFMA performance (1/518 = .2%), but TLD doesn't take an offset. + # LDG.CI doesn't have this issue, but doesn't give you the nice features of texture loads: + # -Boundry Clamping: simplifies our matrix load logic so we don't need to worry about loading out of bounds + # -Normalized Floats: if we don't need full 32 bits of precision we could store our matrices using 16 or 8 bit values + j7c63 => + "--:-:-:-:1 \@P0 IADD track0, track0, ldx8;\n" . + "--:-:-:-:0 \@P0 IADD track4, track4, ldx8;\n" . + "--:-:-:Y:5 \@P0 BRA LOOP;\n", + ); + + my $out; + # We unroll our main loop 8 iterations. + # This gives us a loop instruction count of 556. Add the control instructions and that makes it 741 opcodes sized 8 bytes. 
+ # This is 5928 bytes, nicely fitting inside the 8kb instruction cache. Going to the next biggest size would be 12 lines. + # That would be 768 ffmas and not leaving enough room for the other instructions and control codes. + # So by staying inside the instruction cache size, we avoid hitting any instruction fetch latencies. + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + # Our rolling blocking registers stay one load ahead off the FFMA's (rs: read share) + my $rsOffset = ($j + 1) % 8; + # No need to load on last loop iteration + my $rsPred = $j == 7 ? '@P0' : ' '; + + # You can experiment here with different vector load sizes + my $vec = 128; + + if ($vec == 128) + { + # Roll up our LDS ops here to keep them easier to manage and tune + # Space at every other clock to maximize throughput. + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBy64, [readBs + 4x<%d*128 + 64>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; + } + elsif ($vec == 64) + { + # LDS.64 runs about 22 Gflops slower than LDS.128 (GM107). Not a huge difference since our latencies are so well hidden. + # I think LDS.128 is implemented internally as a pair of LDS.64 ops which could be another reason for the comparable performance. + # I think the big benefit with 128 is being able to issue all our LDS ops earlier, allowing more FFMA's prior to reading out the results. + # There could also be additional opportunity for delayed bank conflicts. 
+ $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c8"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c10"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c12"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c14"} = sprintf "--:-:1:-:1 %s LDS.U.64 j%dBy66, [readBs + 4x<%d*128 + 66>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; + } + else + { + # This one drops performance by over 200 Gflops. So you want to at least use LDS.64 if you can. + # We don't even have room to properly space these at half throuput. 
+ $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c1"} = sprintf "--:-:-:-:1 %s LDS j%dAx01, [readAs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c3"} = sprintf "--:-:-:-:1 %s LDS j%dAx03, [readAs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c5"} = sprintf "--:-:-:-:1 %s LDS j%dBy01, [readBs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:-:-:1 %s LDS j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c7"} = sprintf "--:-:-:-:1 %s LDS j%dBy03, [readBs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c8"} = sprintf "--:-:-:-:1 %s LDS j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c9"} = sprintf "--:-:-:-:1 %s LDS j%dAx65, [readAs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c10"} = sprintf "--:-:-:-:1 %s LDS j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c11"} = sprintf "--:-:-:-:1 %s LDS j%dAx67, [readAs + 4x<%d*128 + 67>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c12"} = sprintf "--:-:-:-:1 %s LDS j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c13"} = sprintf "--:-:-:-:1 %s LDS j%dBy65, [readBs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c14"} = sprintf "--:-:-:-:1 %s LDS j%dBy66, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c15"} = sprintf "--:-:1:-:1 %s LDS j%dBy67, [readBs + 4x<%d*128 + 67>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; + } + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + # Grab an instruction for insertion if one exists for this j and c combination + my $ins = $insert{"j${j}c$c"} || ''; + + # Scatter some yields in there to better balance the workload and reduce sync stalls + # Don't pair a yield with the dual issued ffmas as that kills performance for some reason + ##### This no longer offers extra performance on GM204 as it did on GM107. It still does for the 64 thread version. Keeping since it doesn't hurt. #### + my $yield = $c == 32 ? 'Y' : '-'; + + # The first FFMA needs to wait on the prior loop's LDS.U.128 ops to finish (except if the barrier does the wait for us) + my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--',''); + + # Dual issue these ops + my $stall = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + # output our FFMA and also any inserted ops + $out .= sprintf "%s FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $comment, $ins; + } + } + return $out; + + + +// Main loop is done, time to write C to global memory. + + +// Remove the high bits if present from the last loop's xor. +// Also remove the 4096 added onto readBs. +// This gives us the x and y coordinates of the start of this thread's data in C. +--:-:-:-:1 LOP.AND readAs, readAs, 0xfff; +--:-:-:-:1 LOP.AND readBs, readBs, 0xfff; + +// Remap readAs and readBs onto writeCs so we can shuffle the output for coalesced global writes. +// readAs stays constant, readBs collapses down from stride 4 to 1 +// writeCs = (readBs / 4) * 128 + readAs; +--:-:-:-:1 ISCADD writeCs, readBs, readAs, 5; + +// Read out the C values from shared in a simple tid mapped pattern but +// offset by the position of this warp's collapsed data in shared.
+ +// cx = tid31 | (tid128 >> 2); +--:-:-:-:1 SHR.U32 cx, tid128, 2; +--:-:-:-:1 LOP.OR cx, tid31, cx; + +// readCs = ((tid96 << 4) | cx) << 2; +--:-:-:-:1 SHL readCs, tid96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, cx; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx += bx*128; +--:-:-:-:1 ISCADD cx, bx, cx, 7; + +// cy = by*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 ISCADD cy00, by, cy00, 7; + +// C += (cy*ldc + cx) * 4; +--:-:-:-:1 MOV ldc, c[0x0][0x158]; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_ci; +--:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; + +// When writing in assembly, being able to 'printf' is sometimes easier than stepping through the debugger. +// Here's how it's done. Drop something like this in your code. Then modify the c code to accept this +// many params per thread to printf (see assemblySgemm function). + +//--:-:-:-:1 SHR.U32 smId, smId, 20; + +// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4 +// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4 +//--:-:-:-:1 MOV gridDimX, c[0x0][0x14]; +//--:-:-:-:1 MOV blckDimX, c[0x0][0x8]; +//--:-:-:-:1 XMAD.LO D, by, gridDimX, bx, xmad_D; +//--:-:-:-:1 XMAD.LO D, D, blckDimX, tid, xmad_D; +//--:-:-:-:1 ISCADD D, D, c[0x0][0x160], 3; // 4 bytes * 2 vars = 8 or shift 3 + +//--:-:-:-:1 STG.CS [D + 4x<0>], readAs; +//--:-:-:-:1 STG.CS [D + 4x<1>], readBs; +//--:-:-:-:1 STG.CS [D + 4x<2>], writeCs; +//--:-:-:-:1 STG.CS [D + 4x<3>], readCs; +//--:-:-:-:1 STG.CS [D + 4x<4>], cx; +//--:-:-:-:1 STG.CS [D + 4x<5>], cy00; +//--:-:-:-:1 STG.CS [D + 4x<6>], ci; +//--:-:-:-:1 STG.CS [D + 4x<7>], cx67y67; + +//--:-:-:-:1 STG.CS [D + 4x<0>], smId; +//--:-:-:-:1 STG.CS [D + 4x<1>], clock; + + +// Setup our matrix bounds checking vars and preds +// Bounds checking is what allows this code to work on matrix sizes not a multiple of 128 +--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m +--:-:-:-:1 IADD cx, cx, 64; +--:-:-:-:1 ISETP.LT.AND 
P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m + +--:-:-:-:1 IADD cy00, cy00, -1; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD cy12, cy00, 12; + +// Setup our C output addresses and increments. +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 SHL ldc8, ldc, 5; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; + +// Load the first set of the STORE_C subroutine params in the scheduled block. +# This is also a good time to apply alpha. +--:-:-:-:1 MOV alpha, c[0x0][0x15c]; + +--:-:-:-:1 FMUL cs0, cx00y00, alpha; +--:-:-:-:1 FMUL cs1, cx01y00, alpha; +--:-:-:-:1 FMUL cs2, cx02y00, alpha; +--:-:-:-:1 FMUL cs3, cx03y00, alpha; +--:-:-:-:1 FMUL cs4, cx64y00, alpha; +--:-:-:-:1 FMUL cs5, cx65y00, alpha; +--:-:-:-:1 FMUL cs6, cx66y00, alpha; +--:-:-:-:1 FMUL cs7, cx67y00, alpha; + +// We pre-increment the output addresses so they can be dual issued with memory ops +// So start with a -1 instead of 0 value. +--:-:-:-:1 IADD Cy00, Cy00, -ldc1; +--:-:-:-:1 IADD Cy04, Cy00, ldc4; +--:-:-:-:1 IADD Cy08, Cy00, ldc8; +--:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) + + + +// There's nothing yet in place to handle dependencies with subroutines. +// So don't schedule this block. + + + my $out; + foreach my $y (0..3, 64..67) + { + my ($wait, $comment) = $y == 64 ? ('--', '') : ('02',' // Wait Dep 2'); + + # Jump ahead 60 units (to get to the values at y=64) + $out .= + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n\n" . + + "02:-:-:-:1 IADD Cy00, Cy00, ldc60; // Wait Dep 2\n" . + "--:-:-:-:1 IADD Cy04, Cy04, ldc60;\n" . + "--:-:-:-:1 IADD Cy08, Cy08, ldc60;\n" . + "--:-:-:-:1 IADD Cy12, Cy12, ldc60;\n\n" if $y == 64; + + # We need to move the C values to the param registers of the STORE_C subroutine. + # This is also a good time to apply alpha.
+ $out .= sprintf( + "%s:-:-:-:1 FMUL cs0, cx00y%02d, alpha;%s\n" . + "--:-:-:-:1 FMUL cs1, cx01y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs2, cx02y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs3, cx03y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs4, cx64y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs5, cx65y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs6, cx66y%02d, alpha;\n" . + "--:-:-:-:0 FMUL cs7, cx67y%02d, alpha; // Dual Issue\n", + $wait, $y, $comment, ($y) x 7) if $y; + + # Call the subroutine. + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +// And we're done. The remainder is the STORE_C subroutine that's defined at the end of the kernel. +--:-:-:-:5 EXIT; + +// This routine does warp synchronous shuffling of our output data so as to be able +// to have coalesced writes to global memory. This is actually faster because the shared +// memory latencies can be hidden by other warps and we're only adding a few extra clocks +// to this thread. Global memory here is the bottleneck and being able to halve the needed +// bandwidth at the expense of a few clocks is a modest win. This also keeps power lower +// and our chip running faster. + +// Note, the SHFL instruction doesn't help us here because we're swapping different registers +// from different threads. +STORE_C: + + + +// Each warp writes to its own region of memory so we don't need to bar.sync the access. +// There are some bank conflicts here on the STS.128s but no way to avoid them, and the hit just means a few extra clocks. +// Note here that the scheduler is able to handle the dependencies between vector and non-vector instructions. +// It knows from the instruction type and the register map that cs0 here includes cs1, cs2 and cs3 as well. +--:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], cs4; + +// In a single warp, loads naturally occur after the store to shared completes, no sync required.
+--:-:-:-:1 LDS cs0, [readCs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS cs1, [readCs + 4x<0*128 + 64>]; +--:-:-:-:1 LDS cs2, [readCs + 4x<1*128 + 00>]; +--:-:-:-:1 LDS cs3, [readCs + 4x<1*128 + 64>]; +--:-:-:-:1 LDS cs4, [readCs + 4x<2*128 + 00>]; +--:-:-:-:1 LDS cs5, [readCs + 4x<2*128 + 64>]; +--:-:-:-:1 LDS cs6, [readCs + 4x<3*128 + 00>]; +--:-:1:-:1 LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1 + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; + +--:-:-:-:1 IADD Cy00, Cy00, ldc1; +--:-:-:-:1 IADD Cy04, Cy04, ldc1; +--:-:-:-:1 IADD Cy08, Cy08, ldc1; +--:-:-:-:1 IADD Cy12, Cy12, ldc1; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m + +01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 +--:-:-:-:1 @P1 STG.CG [Cy00 + 4x<64>], cs1; +--:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; +--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<64>], cs3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m + +--:-:-:-:1 @P0 STG.CG [Cy08 + 4x<00>], cs4; +--:-:-:-:1 @P1 STG.CG [Cy08 + 4x<64>], cs5; +--:-:-:-:1 @P2 STG.CG [Cy12 + 4x<00>], cs6; +--:2:-:-:1 @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2 + + + +--:-:-:-:5 RET; + diff --git a/Assembler/MaxAs/sgemm/sgemm64.sass b/Assembler/MaxAs/sgemm/sgemm64.sass new file mode 100644 index 0000000..f037b3e --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm64.sass @@ -0,0 +1,398 @@ +# Kernel: sgemm_kernel_64 +# +# 
SharedSize: 8192 +# Params(8): +# 0:0x140:4:4 param_C, +# 1:0x144:4:0 param_m, +# 2:0x148:4:0 param_n, +# 3:0x14c:4:0 param_k, +# 4:0x150:4:0 param_lda, +# 5:0x154:4:0 param_ldb, +# 6:0x158:4:0 param_ldc +# 7:0x15c:4:0 param_alpha +# 8:0x160:4:4 param_D // for diagnostic printf output +# +# Globals: +# c[0x0][0x164]: texA (the value is 1) +# c[0x0][0x168]: texB (the value is 0) + + + + 0-63 ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end + + 80 : zOffset + 0-63 : cz<00-63> + + 3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35> + 35,34,43,42,51,50,59,58 : cx32y<00-03|32-35> + 39,38,47,46,55,54,63,62 : cx33y<00-03|32-35> + 33,32,41,40,49,48,57,56 : cx34y<00-03|32-35> + 37,36,45,44,53,52,61,60 : cx35y<00-03|32-35> + + 64-79 : j0Ax<00-03|32-35>, j0By<00-03|32-35> + 80-95 : j1Ax<00-03|32-35>, j1By<00-03|32-35> + + 64-71 : cs<0-7> + + 96-111 : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3> + + 112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32 + + 72-111 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX + + + +--:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 +--:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 +--:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 + + + +// blk = tid >= 32 ? by : bx; +// ldx = tid >= 32 ? ldb : lda; +// tex = tid >= 32 ? 
texB : texA; +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1 +06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 +--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150]; +--:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; +--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA +--:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB + +--:-:-:-:1 LOP.AND zOffset, tid, -32; +--:-:-:-:1 STS.128 [zOffset + 4x<16*64>], RZ; + +// tid2 = (tid >> 4) & 1 +// tid15 = tid & 15 +// tid31 = tid & 31 +// tid32 = tid & 32 +--:-:-:-:1 BFE.U32 tid2, tid, 0x104; // 1 bit at position 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid32, tid, 32; + +// ldx4 = ldx * 4; +// ldx8 = ldx * 8; +--:-:-:-:1 SHR.U32 ldx, ldx4, 2; +--:-:-:-:1 IADD ldx8, ldx4, ldx4; + +// track0 = blk*64/4 + tid15 + (ldx * tid2) +--:-:-:-:1 ISCADD track0, blk, tid15, 4; +--:-:-:-:1 XMAD.LO track0, ldx, tid2, track0, xmad_t0; +--:-:-:-:1 IADD3 track2, track0, ldx, ldx; +--:-:-:-:1 IADD track4, track0, ldx4; +--:-:-:-:1 IADD track6, track2, ldx4; + +// writeS = tid15*4*4 + tid2*64*4 +--:-:-:-:1 SHL tid15_4, tid15, 4; +--:-:-:-:1 ISCADD writeS, tid2, tid15_4, 8; + +// writeS += 2048 if tid >= 32 +--:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*64>; + +// int end = track0 + (k-8)*ldx; +--:-:-:-:1 MOV k, c[0x0][0x14c]; +--:-:-:-:1 IADD k, k, -8; +--:-:-:-:1 XMAD.LO end, k, ldx, track0, xmad_end; + +// readAs = ((tid >> 1) & 7) << 4; +--:-:-:-:1 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 2048; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readBs, tid, 0x30; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<8*64>, 4; + + +--:-:1:-:1 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 +--:-:2:-:1 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:3:-:1 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:4:-:1 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4 + + + + + + return join '', map sprintf("--:-:5:-:1 LDS.U.128 cz%02d, [zOffset + 4x<16*64>];\n", $_ * 4), 0..15; + + + + +01:-:-:-:1 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1 +02:-:-:-:1 STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2 +04:-:-:-:1 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3 +08:-:-:-:1 STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4 + +--:-:-:-:1 IADD track0, track0, ldx8; +--:-:-:-:1 IADD track2, track2, ldx8; +--:-:-:-:1 IADD track4, track4, ldx8; +--:-:-:-:1 IADD track6, track6, ldx8; + +10:-:-:-:5 BAR.SYNC 0; + + + +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*64>; + +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1 + +// Efficiency: +// ffma: 512 +// lds: 32 dual issued +// sts: 4 dual issued +// tex: 4 dual issued +// add: 4 +// xor: 3 +// setp: 1 +// bar: 1 dual issued +// bra: 1 dual issued +// Total: 520 (512/520 = 98.5% FFMA) + +LOOP: + +// Loop end condition +--:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; + + + + my @cOrder; + my @swirl = ([2,0],[2,1],[0,1],[0,0]); + my @x = (0,1,32,33); + foreach my $y (0,2,32,34) + { + foreach my $x (@x) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @x = reverse @x; + } + + my %insert = + ( + j0c31 => "--:-:-:-:1 \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;\n", + j0c33 => "--:-:2:-:1 \@P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2\n", + + j1c31 => "--:-:-:-:1 \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;\n", + j1c33 => "--:-:3:-:1 \@P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3\n", + + j5c30 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2\n", + j5c34 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<2*64>], loadX2;\n", + + j6c30 => "04:-:-:-:1 \@P0
STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3\n", + j6c34 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*64>], loadX6;\n", + + j6c62 => + "01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<16*64>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<16*64>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<16*64>;\n", + + j7c63 => + "--:-:-:-:1 \@P0 IADD track0, track0, ldx8;\n" . + "--:-:-:-:1 \@P0 IADD track2, track2, ldx8;\n" . + "--:-:-:-:1 \@P0 IADD track4, track4, ldx8;\n" . + "--:-:-:-:0 \@P0 IADD track6, track6, ldx8;\n" . + "--:-:-:Y:5 \@P0 BRA LOOP;\n", + ); + + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx00, [readAs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBy00, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx32, [readAs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBy32, [readBs + 4x<%d*64 + 32>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $yield = $c == 32 ? 'Y' : '-'; + + my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--',''); + + my $stall = $ins =~ /LDS|TLD|STS|BAR/ ? 
0 : 1; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $comment, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 LOP.AND readAs, readAs, 0x7ff; +--:-:-:-:1 LOP.AND readBs, readBs, 0x7ff; + +// writeCs = (readBs / 4) * 64 + readAs; +--:-:-:-:1 ISCADD writeCs, readBs, readAs, 4; + +// readCs = ((tid32 << 3) + tid31) << 2; +--:-:-:-:1 ISCADD readCs, tid32, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = bx*64 + tid31; +--:-:-:-:1 ISCADD cx, bx, tid31, 6; + +// cy = by*64 + (tid32 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid32, 1; +--:-:-:-:1 ISCADD cy00, by, cy00, 6; + +// C += (cy*ldc + cx) * 4; +--:-:-:-:1 MOV ldc, c[0x0][0x158]; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_ci; +--:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; + +--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m +--:-:-:-:1 IADD cx, cx, 32; +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m + +// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4 +// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4 +//--:-:-:-:1 MOV gridDimX, c[0x0][0x14]; +//--:-:-:-:1 MOV blckDimX, c[0x0][0x8]; +//--:-:-:-:1 XMAD.LO D, by, gridDimX, bx, xmadD; +//--:-:-:-:1 XMAD.LO D, D, blckDimX, tid, xmadD; +//--:-:-:-:1 ISCADD D, D, c[0x0][0x160], 5; // 4 bytes * 8 vars = 32 or shift 5 + +//--:-:-:-:1 STG.CS [D + 4x<0>], readAs; +//--:-:-:-:1 STG.CS [D + 4x<1>], readBs; +//--:-:-:-:1 STG.CS [D + 4x<2>], writeCs; +//--:-:-:-:1 STG.CS [D + 4x<3>], readCs; +//--:-:-:-:1 STG.CS [D + 4x<4>], cx; +//--:-:-:-:1 STG.CS [D + 4x<5>], cy00; +//--:-:-:-:1 STG.CS [D + 4x<6>], ci; +//--:-:-:-:1 STG.CS [D + 4x<7>], cx35y35; + +--:-:-:-:1 IADD cy00, cy00, -1; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD cy12, cy00, 12; + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 SHL ldc8, ldc, 5;
+--:-:-:-:1 ISCADD ldc28, ldc, -ldc4, 7; + +--:-:-:-:1 MOV alpha, c[0x0][0x15c]; +--:-:-:-:1 FMUL cs0, cx00y00, alpha; +--:-:-:-:1 FMUL cs1, cx01y00, alpha; +--:-:-:-:1 FMUL cs2, cx02y00, alpha; +--:-:-:-:1 FMUL cs3, cx03y00, alpha; +--:-:-:-:1 FMUL cs4, cx32y00, alpha; +--:-:-:-:1 FMUL cs5, cx33y00, alpha; +--:-:-:-:1 FMUL cs6, cx34y00, alpha; +--:-:-:-:1 FMUL cs7, cx35y00, alpha; + +--:-:-:-:1 IADD Cy00, Cy00, -ldc1; +--:-:-:-:1 IADD Cy04, Cy00, ldc4; +--:-:-:-:1 IADD Cy08, Cy00, ldc8; +--:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) + + + + + + my $out; + foreach my $y (0..3, 32..35) + { + my ($wait, $comment) = $y == 32 ? ('--', '') : ('02',' // Wait Dep 2'); + + $out .= + "--:-:-:-:1 IADD cy00, cy00, 28;\n" . + "--:-:-:-:1 IADD cy04, cy04, 28;\n" . + "--:-:-:-:1 IADD cy08, cy08, 28;\n" . + "--:-:-:-:1 IADD cy12, cy12, 28;\n\n" . + + "02:-:-:-:1 IADD Cy00, Cy00, ldc28; // Wait Dep 2\n" . + "--:-:-:-:1 IADD Cy04, Cy04, ldc28;\n" . + "--:-:-:-:1 IADD Cy08, Cy08, ldc28;\n" . + "--:-:-:-:1 IADD Cy12, Cy12, ldc28;\n\n" if $y == 32; + + $out .= sprintf( + "%s:-:-:-:1 FMUL cs0, cx00y%02d, alpha;%s\n" . + "--:-:-:-:1 FMUL cs1, cx01y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs2, cx02y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs3, cx03y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs4, cx32y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs5, cx33y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs6, cx34y%02d, alpha;\n" . 
+ "--:-:-:-:0 FMUL cs7, cx35y%02d, alpha; // Dual Issue\n", + $wait, $y, $comment, ($y) x 7) if $y; + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + + +--:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], cs4; + +--:-:-:-:1 LDS cs0, [readCs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS cs1, [readCs + 4x<0*64 + 32>]; +--:-:-:-:1 LDS cs2, [readCs + 4x<1*64 + 00>]; +--:-:-:-:1 LDS cs3, [readCs + 4x<1*64 + 32>]; +--:-:-:-:1 LDS cs4, [readCs + 4x<2*64 + 00>]; +--:-:-:-:1 LDS cs5, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 LDS cs6, [readCs + 4x<3*64 + 00>]; +--:-:1:-:1 LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1 + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; + +--:-:-:-:1 IADD Cy00, Cy00, ldc1; +--:-:-:-:1 IADD Cy04, Cy04, ldc1; +--:-:-:-:1 IADD Cy08, Cy08, ldc1; +--:-:-:-:1 IADD Cy12, Cy12, ldc1; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m + +01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 +--:-:-:-:1 @P1 STG.CG [Cy00 + 4x<32>], cs1; +--:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; +--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<32>], cs3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m + +--:-:-:-:1 @P0 STG.CG [Cy08 + 4x<00>], cs4; +--:-:-:-:1 @P1 STG.CG [Cy08 + 4x<32>], cs5; +--:-:-:-:1 @P2 STG.CG [Cy12 + 4x<00>], cs6; +--:2:-:-:1 @P3 STG.CG [Cy12 + 
4x<32>], cs7; // Set Dep 2 + + + +--:-:-:-:5 RET; + diff --git a/Assembler/MaxAs/sgemm/sgemm_final_128.sass b/Assembler/MaxAs/sgemm/sgemm_final_128.sass new file mode 100644 index 0000000..ce7b0e7 --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm_final_128.sass @@ -0,0 +1,793 @@ +# Kernel: sgemm_kernel_128 +# Arch: sm_50 +# InsCnt: 770 +# RegCnt: 118 +# SharedSize: 16384 +# BarCnt: 1 +# Params(9): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 +# 3:0x14c:4:0 +# 4:0x150:4:0 +# 5:0x154:4:0 +# 6:0x158:4:0 +# 7:0x15c:4:0 +# 8:0x160:4:0 +# +# Instructions: + +--:-:1:-:1 S2R R112, SR_TID.X; +--:-:2:-:1 S2R R113, SR_CTAID.X; +--:-:3:-:1 S2R R114, SR_CTAID.Y; +01:-:-:Y:1 ISETP.GE.AND P0, PT, R112.reuse, 0x80, PT; +--:-:-:-:1 LOP.AND R117, R112.reuse, 0x1f; +--:-:-:-:1 BFE.U32 R9, R112.reuse, 0x205; +--:-:-:-:1 MOV R13, c[0x0][0x14c]; +--:-:-:-:1 BFE.U32 R4, R112.reuse, 0x301; +--:-:-:-:1 LOP.AND R115, R112.reuse, 0x80; +--:-:-:-:1 LOP.AND R107, R112.reuse, 0x70; +--:-:-:-:1 SHL R16, R117, 0x4; +--:-:-:-:1 LOP.AND R0, R112.reuse, 0x1; +--:-:-:-:1 IADD R13, R13, -0x8; +--:-:-:-:1 LOP.AND R80, R112.reuse, -0x20; +--:-:-:-:1 SHR.U32 R106, R115, 0x4; +--:-:-:-:1 LOP.AND R116, R112, 0x60; +--:-:-:-:1 SHR.U32 R107, R107, 0x3; +--:-:-:-:0 @!P0 MOV R1, c[0x0][0x150]; +--:-:-:-:1 STS.128 [R80+0x2000], RZ; +--:-:-:-:1 @P0 MOV R1, c[0x0][0x154]; +--:-:-:-:1 ISCADD R111, R9, R16, 0x9; +06:-:-:-:1 SEL R12, R114, R113, P0; +--:-:-:-:1 @!P0 MOV32I R110, 0x80000001; +--:-:-:-:1 @P0 MOV32I R110, 0x80000000; +--:-:-:-:1 LOP.OR R106, R106, R4; +--:-:-:-:1 SHR.U32 R8, R1.reuse, 0x2; +--:-:-:-:1 LOP.OR R107, R107, R0; +--:-:-:-:1 ISCADD R104, R12, R117, 0x5; +--:-:-:-:1 IADD R109, R1, R1; +--:-:-:-:1 @P0 IADD R111, R111, 0x1000; +--:-:-:-:1 SHL R106, R106, 0x4; +--:-:-:-:1 XMAD.MRG R5, R8.reuse, R9.H1.reuse, RZ; +--:-:-:-:1 ISCADD R107, R107, 0x1000, 0x4; +--:-:-:-:1 XMAD R104, R8.reuse, R9, R104; +--:-:-:Y:5 XMAD.MRG R20, R13.reuse, R8.H1.reuse, RZ; +--:-:-:-:2 XMAD.PSL.CBCC 
R104, R8.H1, R5.H1, R104; +--:-:1:-:4 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf; +--:-:-:-:1 IADD R108, R104, R1; +--:-:-:-:1 XMAD R105, R13.reuse, R8, R104; +--:-:2:Y:5 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf; +--:-:-:-:1 XMAD.PSL.CBCC R105, R13.H1, R20.H1, R105; +--:-:3:-:1 LDS.U.128 R0, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R4, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R8, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R12, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R16, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R20, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R24, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R28, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R32, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R36, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R40, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R44, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R48, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R52, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R56, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R60, [R80+0x2000]; +01:-:-:-:1 STS.128 [R111], R96; +--:-:-:-:0 IADD R104, R104, R109.reuse; +02:-:-:-:1 STS.128 [R111+0x800], R100; +--:-:-:-:0 IADD R108, R108, R109; +04:-:-:-:5 BAR.SYNC 0x0; +--:-:-:-:0 LOP.XOR R111, R111, 0x2000; +--:-:-:-:1 LDS.U.128 R64, [R106]; +--:-:-:-:1 LDS.U.128 R72, [R107]; +--:-:-:-:1 LDS.U.128 R68, [R106+0x100]; +--:-:1:-:1 LDS.U.128 R76, [R107+0x100]; +TARGET1: +--:-:-:-:1 ISETP.LE.AND P0, PT, R104, R105, PT; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R106+0x200]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R107+0x200]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R106+0x300]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R107+0x300]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, 
R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:0 FFMA R11, R64.reuse, R74, R11; +--:-:2:-:1 @P0 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:0 FFMA R16, R66, R77.reuse, R16; +--:-:3:-:1 @P0 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, 
R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R106+0x400]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R107+0x400]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R106+0x500]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R107+0x500]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA 
R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R106+0x600]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R107+0x600]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R106+0x700]; +--:-:-:-:1 FFMA R4, R67, 
R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R107+0x700]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA 
R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R106+0x800]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R107+0x800]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R106+0x900]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R107+0x900]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, 
R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R106+0xa00]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, 
[R107+0xa00]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R106+0xb00]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R107+0xb00]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, 
R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R106+0xc00]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R107+0xc00]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R106+0xd00]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R107+0xd00]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; 
+--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, 
R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R106+0xe00]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R107+0xe00]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R106+0xf00]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R107+0xf00]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:0 FFMA R10, R64.reuse, R75, R10; +02:-:-:-:1 @P0 STS.128 [R111], R96; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:0 FFMA R18, R64.reuse, R77.reuse, R18; +04:-:-:-:1 @P0 STS.128 [R111+0x800], R100; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; 
+--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:0 FFMA R26, R64.reuse, R79, R26; +01:-:-:-:5 BAR.SYNC 0x0; +--:-:-:-:1 @P0 LOP.XOR R106, R106, 0x2000; +--:-:-:-:1 @P0 LOP.XOR R107, R107, 0x2000; +--:-:-:-:1 @P0 LOP.XOR R111, R111, 0x2000; +--:-:-:-:1 FFMA R27, R64, R78, R27; +--:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 @P0 LDS.U.128 R64, [R106]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 @P0 LDS.U.128 R72, [R107]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 @P0 LDS.U.128 R68, [R106+0x100]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 @P0 LDS.U.128 R76, [R107+0x100]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; 
+--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, 
R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +--:-:-:-:1 @P0 IADD R104, R104, R109.reuse; +--:-:-:-:0 @P0 IADD R108, R108, R109; +--:-:-:Y:5 @P0 BRA TARGET1; +--:-:-:-:1 SHR.U32 R84, R115, 0x2; +--:-:-:-:1 MOV R77, c[0x0][0x158]; +--:-:-:-:1 SHR.U32 R80, R116.reuse, 0x1; +--:-:-:-:1 MOV R72, c[0x0][0x15c]; +--:-:-:-:1 SHL R89, R116, 0x4; +--:-:-:-:1 LOP.AND R106, R106, 0xfff; +--:-:-:-:1 LOP.OR R84, R117, R84; +--:-:-:-:1 SHL R81, R77.reuse, 0x2; +--:-:-:-:1 LOP.AND R107, R107, 0xfff; +--:-:-:-:1 ISCADD R80, R114, R80, 0x7; +--:-:-:-:1 FMUL R64, R3, R72.reuse; +--:-:-:-:1 SHL R74, R77.reuse, 0x4; +--:-:-:-:1 LOP.OR R89, R89, R84; +--:-:-:-:1 ISCADD R84, R113, R84, 0x7; +--:-:-:-:1 FMUL R65, R7, R72.reuse; +--:-:-:-:1 SHL R88, R77, 0x5; +--:-:-:-:1 XMAD.MRG R78, R80.reuse, R77.H1.reuse, RZ; +--:-:-:-:1 ISCADD R90, R107, R106, 0x5; +--:-:-:-:1 FMUL R66, R1, R72.reuse; +--:-:-:-:1 SHL R89, R89, 0x2; +--:-:-:-:1 XMAD R73, R80, R77, R84; +--:-:-:-:1 ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; +--:-:-:-:1 IADD R84, R84, 0x40; +--:-:-:-:1 ISCADD R85, R77, -R74, 0x8; +--:-:-:-:1 FMUL R67, R5, R72.reuse; +--:-:-:-:1 FMUL R68, R35, R72.reuse; +--:-:-:-:1 XMAD.PSL.CBCC R73, R80.H1, R78.H1, R73; +--:-:-:-:1 IADD R80, R80, -0x1; +--:-:-:-:1 ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT; +--:-:-:-:1 FMUL R69, R39, R72.reuse; +--:-:-:-:1 FMUL R70, R33, R72.reuse; +--:-:-:-:1 FMUL R71, R37, R72; +--:-:-:-:1 ISCADD R76, R73, c[0x0][0x140], 0x2; +--:-:-:-:1 IADD R83, R80.reuse, 0x4; +--:-:-:-:1 IADD R86, R80.reuse, 0x8; +--:-:-:-:3 IADD R87, R80, 0xc; +--:-:-:Y:6 IADD R76, R76, -R81; +--:-:-:-:1 IADD R75, R76.reuse, R74; +--:-:-:Y:5 IADD R79, R76, R88.reuse; 
+--:-:-:-:0 IADD R82, R75, R88; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R2, R72.reuse; +--:-:-:-:1 FMUL R65, R6, R72.reuse; +--:-:-:-:1 FMUL R66, R0, R72.reuse; +--:-:-:-:1 FMUL R67, R4, R72.reuse; +--:-:-:-:1 FMUL R68, R34, R72.reuse; +--:-:-:-:1 FMUL R69, R38, R72.reuse; +--:-:-:-:1 FMUL R70, R32, R72.reuse; +--:-:-:-:0 FMUL R71, R36, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R11, R72.reuse; +--:-:-:-:1 FMUL R65, R15, R72.reuse; +--:-:-:-:1 FMUL R66, R9, R72.reuse; +--:-:-:-:1 FMUL R67, R13, R72.reuse; +--:-:-:-:1 FMUL R68, R43, R72.reuse; +--:-:-:-:1 FMUL R69, R47, R72.reuse; +--:-:-:-:1 FMUL R70, R41, R72.reuse; +--:-:-:-:0 FMUL R71, R45, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R10, R72.reuse; +--:-:-:-:1 FMUL R65, R14, R72.reuse; +--:-:-:-:1 FMUL R66, R8, R72.reuse; +--:-:-:-:1 FMUL R67, R12, R72.reuse; +--:-:-:-:1 FMUL R68, R42, R72.reuse; +--:-:-:-:1 FMUL R69, R46, R72.reuse; +--:-:-:-:1 FMUL R70, R40, R72.reuse; +--:-:-:-:0 FMUL R71, R44, R72; +--:-:-:-:5 CAL TARGET2; +--:-:-:-:1 IADD R80, R80, 0x3c; +--:-:-:-:1 IADD R83, R83, 0x3c; +--:-:-:-:1 IADD R86, R86, 0x3c; +--:-:-:-:1 IADD R87, R87, 0x3c; +02:-:-:-:1 IADD R76, R76, R85.reuse; +--:-:-:-:1 IADD R75, R75, R85.reuse; +--:-:-:-:1 IADD R79, R79, R85.reuse; +--:-:-:-:1 IADD R82, R82, R85; +--:-:-:-:1 FMUL R64, R19, R72.reuse; +--:-:-:-:1 FMUL R65, R23, R72.reuse; +--:-:-:-:1 FMUL R66, R17, R72.reuse; +--:-:-:-:1 FMUL R67, R21, R72.reuse; +--:-:-:-:1 FMUL R68, R51, R72.reuse; +--:-:-:-:1 FMUL R69, R55, R72.reuse; +--:-:-:-:1 FMUL R70, R49, R72.reuse; +--:-:-:-:0 FMUL R71, R53, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R18, R72.reuse; +--:-:-:-:1 FMUL R65, R22, R72.reuse; +--:-:-:-:1 FMUL R66, R16, R72.reuse; +--:-:-:-:1 FMUL R67, R20, R72.reuse; +--:-:-:-:1 FMUL R68, R50, R72.reuse; +--:-:-:-:1 FMUL R69, R54, R72.reuse; +--:-:-:-:1 FMUL R70, R48, R72.reuse; +--:-:-:-:0 FMUL R71, R52, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R27, R72.reuse; +--:-:-:-:1 FMUL 
R65, R31, R72.reuse; +--:-:-:-:1 FMUL R66, R25, R72.reuse; +--:-:-:-:1 FMUL R67, R29, R72.reuse; +--:-:-:-:1 FMUL R68, R59, R72.reuse; +--:-:-:-:1 FMUL R69, R63, R72.reuse; +--:-:-:-:1 FMUL R70, R57, R72.reuse; +--:-:-:-:0 FMUL R71, R61, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R26, R72.reuse; +--:-:-:-:1 FMUL R65, R30, R72.reuse; +--:-:-:-:1 FMUL R66, R24, R72.reuse; +--:-:-:-:1 FMUL R67, R28, R72.reuse; +--:-:-:-:1 FMUL R68, R58, R72.reuse; +--:-:-:-:1 FMUL R69, R62, R72.reuse; +--:-:-:-:1 FMUL R70, R56, R72.reuse; +--:-:-:-:0 FMUL R71, R60, R72; +--:-:-:-:5 CAL TARGET2; +--:-:-:-:5 EXIT; +TARGET2: +--:-:-:-:0 IADD R80, R80, 0x1; +--:-:-:-:1 STS.128 [R90], R64; +--:-:-:-:0 IADD R83, R83, 0x1; +--:-:-:-:1 STS.128 [R90+0x100], R68; +--:-:-:-:0 IADD R86, R86, 0x1; +--:-:-:-:1 LDS R64, [R89]; +--:-:-:-:0 IADD R87, R87, 0x1; +--:-:-:-:1 LDS R65, [R89+0x100]; +--:-:-:-:0 IADD R76, R76, R81.reuse; +--:-:-:-:1 LDS R66, [R89+0x200]; +--:-:-:-:0 IADD R75, R75, R81.reuse; +--:-:-:-:1 LDS R67, [R89+0x300]; +--:-:-:-:0 IADD R79, R79, R81.reuse; +--:-:-:-:1 LDS R68, [R89+0x400]; +--:-:-:-:0 IADD R82, R82, R81; +--:-:-:-:1 LDS R69, [R89+0x500]; +--:-:-:-:1 ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; +--:-:-:-:1 LDS R70, [R89+0x600]; +--:-:-:-:1 ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6; +--:-:1:-:1 LDS R71, [R89+0x700]; +--:-:-:-:2 ISETP.LT.AND P2, PT, R83.reuse, c[0x0][0x148], P5; +--:-:-:Y:7 ISETP.LT.AND P3, PT, R83, c[0x0][0x148], P6; +01:-:-:-:1 @P0 STG.CG [R76], R64; +--:-:-:-:1 ISETP.LT.AND P0, PT, R86.reuse, c[0x0][0x148], P5; +--:-:-:-:1 @P1 STG.CG [R76+0x100], R65; +--:-:-:-:1 ISETP.LT.AND P1, PT, R86, c[0x0][0x148], P6; +--:-:-:-:1 @P2 STG.CG [R75], R66; +--:-:-:-:1 ISETP.LT.AND P2, PT, R87.reuse, c[0x0][0x148], P5; +--:-:-:-:1 @P3 STG.CG [R75+0x100], R67; +--:-:-:Y:7 ISETP.LT.AND P3, PT, R87, c[0x0][0x148], P6; +--:-:-:-:2 @P0 STG.CG [R79], R68; +--:-:-:-:2 @P1 STG.CG [R79+0x100], R69; +--:-:-:-:2 @P2 STG.CG [R82], R70; +--:2:-:-:1 @P3 STG.CG 
[R82+0x100], R71; +--:-:-:-:5 RET; diff --git a/Assembler/MaxAs/sgemm/sgemm_final_64.sass b/Assembler/MaxAs/sgemm/sgemm_final_64.sass new file mode 100644 index 0000000..815ae5d --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm_final_64.sass @@ -0,0 +1,802 @@ +# Kernel: sgemm_kernel_64 +# Arch: sm_50 +# InsCnt: 779 +# RegCnt: 127 +# SharedSize: 8192 +# BarCnt: 1 +# Params(9): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 +# 3:0x14c:4:0 +# 4:0x150:4:0 +# 5:0x154:4:0 +# 6:0x158:4:0 +# 7:0x15c:4:0 +# 8:0x160:4:0 +# +# Instructions: + +--:-:1:-:1 S2R R119, SR_TID.X; +--:-:2:-:1 S2R R125, SR_CTAID.X; +--:-:3:-:1 S2R R122, SR_CTAID.Y; +01:-:-:-:1 ISETP.GE.AND P0, PT, R119.reuse, 0x20, PT; +--:-:-:-:1 LOP.AND R9, R119.reuse, 0xf; +--:-:-:-:1 BFE.U32 R4, R119.reuse, 0x104; +--:-:-:-:1 MOV R12, c[0x0][0x14c]; +--:-:-:-:1 BFE.U32 R114, R119.reuse, 0x301; +--:-:-:-:1 LOP.AND R115, R119.reuse, 0x30; +--:-:-:-:1 LOP.AND R0, R119.reuse, 0x1; +--:-:-:-:1 SHL R13, R9, 0x4; +--:-:-:-:1 LOP.AND R80, R119.reuse, -0x20; +--:-:-:-:1 IADD R12, R12, -0x8; +--:-:-:-:1 SHL R114, R114, 0x4; +--:-:-:-:1 LOP.AND R126, R119, 0x1f; +--:-:-:-:1 SHR.U32 R115, R115, 0x3; +--:-:-:-:0 @!P0 MOV R2, c[0x0][0x150]; +--:-:-:-:1 STS.128 [R80+0x1000], RZ; +--:-:-:-:1 @P0 MOV R2, c[0x0][0x154]; +--:-:-:-:1 ISCADD R118, R4, R13, 0x8; +06:-:-:-:1 SEL R8, R122, R125, P0; +--:-:-:-:1 @!P0 MOV32I R113, 0x80000001; +--:-:-:-:1 @P0 MOV32I R113, 0x80000000; +--:-:-:-:1 LOP.OR R115, R115, R0; +--:-:-:-:1 SHR.U32 R1, R2.reuse, 0x2; +--:-:-:-:1 LOP.AND R123, R119, 0x20; +--:-:-:-:1 ISCADD R112, R8, R9, 0x4; +--:-:-:-:1 IADD R121, R2, R2; +--:-:-:-:1 @P0 IADD R118, R118, 0x800; +--:-:-:-:1 ISCADD R115, R115, 0x800, 0x4; +--:-:-:-:1 XMAD.MRG R5, R1.reuse, R4.H1.reuse, RZ; +--:-:-:-:1 XMAD.MRG R16, R12.reuse, R1.H1.reuse, RZ; +--:-:-:Y:6 XMAD R112, R1.reuse, R4, R112; +--:-:-:-:2 XMAD.PSL.CBCC R112, R1.H1, R5.H1, R112; +--:-:1:-:4 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf; +--:-:-:-:1 IADD3 R116, 
R112.reuse, R1.reuse, R1; +--:-:-:-:1 IADD R120, R112, R2.reuse; +--:-:2:-:1 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf; +--:-:-:-:0 XMAD R117, R12.reuse, R1, R112; +--:-:3:-:3 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf; +--:-:-:-:2 IADD R124, R116, R2; +--:-:4:-:1 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf; +--:-:-:-:1 XMAD.PSL.CBCC R117, R12.H1, R16.H1, R117; +--:-:5:-:1 LDS.U.128 R0, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R4, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R8, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R12, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R16, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R20, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R24, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R28, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R32, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R36, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R40, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R44, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R48, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R52, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R56, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R60, [R80+0x1000]; +01:-:-:-:1 STS.128 [R118], R96; +--:-:-:-:0 IADD R112, R112, R121.reuse; +02:-:-:-:1 STS.128 [R118+0x200], R100; +--:-:-:-:0 IADD R116, R116, R121.reuse; +04:-:-:-:1 STS.128 [R118+0x400], R104; +--:-:-:-:0 IADD R120, R120, R121.reuse; +08:-:-:-:1 STS.128 [R118+0x600], R108; +--:-:-:-:0 IADD R124, R124, R121; +10:-:-:-:5 BAR.SYNC 0x0; +--:-:-:-:0 LOP.XOR R118, R118, 0x1000; +--:-:-:-:1 LDS.U.128 R64, [R114]; +--:-:-:-:1 LDS.U.128 R72, [R115]; +--:-:-:-:1 LDS.U.128 R68, [R114+0x80]; +--:-:1:-:1 LDS.U.128 R76, [R115+0x80]; +TARGET1: +--:-:-:-:1 ISETP.LE.AND P0, PT, R112, R117, PT; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R114+0x100]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R115+0x100]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R114+0x180]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 
FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R115+0x180]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:0 FFMA R11, R64.reuse, R74, R11; +--:-:-:-:1 @P0 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:0 FFMA R16, R66, R77.reuse, R16; +--:-:2:-:1 @P0 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 
FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R114+0x200]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R115+0x200]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R114+0x280]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R115+0x280]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, 
R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:0 FFMA R11, R80.reuse, R90, R11; +--:-:-:-:1 @P0 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:0 FFMA R16, R82, R93.reuse, R16; +--:-:3:-:1 @P0 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, 
R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R114+0x300]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R115+0x300]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R114+0x380]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R115+0x380]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; 
+--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R114+0x400]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R115+0x400]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R114+0x480]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R115+0x480]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, 
R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; 
+--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R114+0x500]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R115+0x500]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R114+0x580]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R115+0x580]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, 
R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R114+0x600]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R115+0x600]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R114+0x680]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R115+0x680]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, 
R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:0 FFMA R10, R80.reuse, R91, R10; +02:-:-:-:1 @P0 STS.128 [R118], R96; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:0 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 @P0 STS.128 [R118+0x200], R100; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, 
R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R114+0x700]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R115+0x700]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R114+0x780]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R115+0x780]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:0 FFMA R10, R64.reuse, R75, R10; +04:-:-:-:1 
@P0 STS.128 [R118+0x400], R104; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:0 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 @P0 STS.128 [R118+0x600], R108; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:0 FFMA R26, R64.reuse, R79, R26; +01:-:-:-:5 BAR.SYNC 0x0; +--:-:-:-:1 @P0 LOP.XOR R114, R114, 0x1000; +--:-:-:-:1 @P0 LOP.XOR R115, R115, 0x1000; +--:-:-:-:1 @P0 LOP.XOR R118, R118, 0x1000; +--:-:-:-:1 FFMA R27, R64, R78, R27; +--:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 @P0 LDS.U.128 R64, [R114]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 @P0 LDS.U.128 R72, [R115]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; 
+--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 @P0 LDS.U.128 R68, [R114+0x80]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 @P0 LDS.U.128 R76, [R115+0x80]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, 
R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +--:-:-:-:1 @P0 IADD R112, R112, R121.reuse; +--:-:-:-:1 @P0 IADD R116, R116, R121.reuse; +--:-:-:-:1 @P0 IADD R120, R120, R121.reuse; +--:-:-:-:0 @P0 IADD R124, R124, R121; +--:-:-:Y:5 @P0 BRA TARGET1; +--:-:-:-:1 SHR.U32 R80, R123.reuse, 0x1; +--:-:-:-:1 MOV R81, c[0x0][0x158]; +--:-:-:-:1 ISCADD R84, R125, R126.reuse, 0x6; +--:-:-:-:1 MOV R72, c[0x0][0x15c]; +--:-:-:-:1 ISCADD R92, R123, R126, 0x3; +--:-:-:-:1 LOP.AND R114, R114, 0x7ff; +--:-:-:-:1 ISCADD R80, R122, R80, 0x6; +--:-:-:-:1 LOP.AND R115, R115, 0x7ff; +--:-:-:-:1 SHL R77, R81.reuse, 0x2; +--:-:-:-:1 ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; +--:-:-:-:1 SHL R89, R81.reuse, 0x4; +--:-:-:-:1 FMUL R64, R3, R72; +--:-:-:-:1 SHL R91, R81.reuse, 0x5; +--:-:-:-:1 XMAD.MRG R74, R80.reuse, R81.H1.reuse, RZ; +--:-:-:-:1 ISCADD R93, R115, R114, 0x4; +--:-:-:-:1 XMAD R73, R80, R81, R84; +--:-:-:-:1 SHL R92, R92, 0x2; +--:-:-:-:1 IADD R84, R84, 0x20; +--:-:-:-:1 ISCADD R85, R81, -R89, 0x7; +--:-:-:-:1 FMUL R65, R7, R72.reuse; +--:-:-:-:1 FMUL R66, R1, R72.reuse; +--:-:-:-:1 XMAD.PSL.CBCC R73, R80.H1, R74.H1, R73; +--:-:-:-:1 IADD R80, R80, -0x1; +--:-:-:-:1 ISETP.LT.AND P6, PT, R84, c[0x0][0x144], 
PT; +--:-:-:-:1 FMUL R67, R5, R72.reuse; +--:-:-:-:1 FMUL R68, R35, R72.reuse; +--:-:-:-:1 FMUL R69, R39, R72.reuse; +--:-:-:-:1 ISCADD R76, R73, c[0x0][0x140], 0x2; +--:-:-:-:1 IADD R86, R80.reuse, 0x4; +--:-:-:-:1 IADD R87, R80.reuse, 0x8; +--:-:-:-:1 IADD R88, R80, 0xc; +--:-:-:-:1 FMUL R70, R33, R72.reuse; +--:-:-:-:1 FMUL R71, R37, R72; +--:-:-:Y:6 IADD R76, R76, -R77; +--:-:-:-:1 IADD R75, R76.reuse, R89; +--:-:-:Y:5 IADD R78, R76, R91.reuse; +--:-:-:-:0 IADD R79, R75, R91; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R2, R72.reuse; +--:-:-:-:1 FMUL R65, R6, R72.reuse; +--:-:-:-:1 FMUL R66, R0, R72.reuse; +--:-:-:-:1 FMUL R67, R4, R72.reuse; +--:-:-:-:1 FMUL R68, R34, R72.reuse; +--:-:-:-:1 FMUL R69, R38, R72.reuse; +--:-:-:-:1 FMUL R70, R32, R72.reuse; +--:-:-:-:0 FMUL R71, R36, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R11, R72.reuse; +--:-:-:-:1 FMUL R65, R15, R72.reuse; +--:-:-:-:1 FMUL R66, R9, R72.reuse; +--:-:-:-:1 FMUL R67, R13, R72.reuse; +--:-:-:-:1 FMUL R68, R43, R72.reuse; +--:-:-:-:1 FMUL R69, R47, R72.reuse; +--:-:-:-:1 FMUL R70, R41, R72.reuse; +--:-:-:-:0 FMUL R71, R45, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R10, R72.reuse; +--:-:-:-:1 FMUL R65, R14, R72.reuse; +--:-:-:-:1 FMUL R66, R8, R72.reuse; +--:-:-:-:1 FMUL R67, R12, R72.reuse; +--:-:-:-:1 FMUL R68, R42, R72.reuse; +--:-:-:-:1 FMUL R69, R46, R72.reuse; +--:-:-:-:1 FMUL R70, R40, R72.reuse; +--:-:-:-:0 FMUL R71, R44, R72; +--:-:-:-:5 CAL TARGET2; +--:-:-:-:1 IADD R80, R80, 0x1c; +--:-:-:-:1 IADD R86, R86, 0x1c; +--:-:-:-:1 IADD R87, R87, 0x1c; +--:-:-:-:1 IADD R88, R88, 0x1c; +02:-:-:-:1 IADD R76, R76, R85.reuse; +--:-:-:-:1 IADD R75, R75, R85.reuse; +--:-:-:-:1 IADD R78, R78, R85.reuse; +--:-:-:-:1 IADD R79, R79, R85; +--:-:-:-:1 FMUL R64, R19, R72.reuse; +--:-:-:-:1 FMUL R65, R23, R72.reuse; +--:-:-:-:1 FMUL R66, R17, R72.reuse; +--:-:-:-:1 FMUL R67, R21, R72.reuse; +--:-:-:-:1 FMUL R68, R51, R72.reuse; +--:-:-:-:1 FMUL R69, R55, R72.reuse; +--:-:-:-:1 FMUL 
R70, R49, R72.reuse; +--:-:-:-:0 FMUL R71, R53, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R18, R72.reuse; +--:-:-:-:1 FMUL R65, R22, R72.reuse; +--:-:-:-:1 FMUL R66, R16, R72.reuse; +--:-:-:-:1 FMUL R67, R20, R72.reuse; +--:-:-:-:1 FMUL R68, R50, R72.reuse; +--:-:-:-:1 FMUL R69, R54, R72.reuse; +--:-:-:-:1 FMUL R70, R48, R72.reuse; +--:-:-:-:0 FMUL R71, R52, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R27, R72.reuse; +--:-:-:-:1 FMUL R65, R31, R72.reuse; +--:-:-:-:1 FMUL R66, R25, R72.reuse; +--:-:-:-:1 FMUL R67, R29, R72.reuse; +--:-:-:-:1 FMUL R68, R59, R72.reuse; +--:-:-:-:1 FMUL R69, R63, R72.reuse; +--:-:-:-:1 FMUL R70, R57, R72.reuse; +--:-:-:-:0 FMUL R71, R61, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R26, R72.reuse; +--:-:-:-:1 FMUL R65, R30, R72.reuse; +--:-:-:-:1 FMUL R66, R24, R72.reuse; +--:-:-:-:1 FMUL R67, R28, R72.reuse; +--:-:-:-:1 FMUL R68, R58, R72.reuse; +--:-:-:-:1 FMUL R69, R62, R72.reuse; +--:-:-:-:1 FMUL R70, R56, R72.reuse; +--:-:-:-:0 FMUL R71, R60, R72; +--:-:-:-:5 CAL TARGET2; +--:-:-:-:5 EXIT; +TARGET2: +--:-:-:-:0 IADD R80, R80, 0x1; +--:-:-:-:1 STS.128 [R93], R64; +--:-:-:-:0 IADD R86, R86, 0x1; +--:-:-:-:1 STS.128 [R93+0x80], R68; +--:-:-:-:0 IADD R87, R87, 0x1; +--:-:-:-:1 LDS R64, [R92]; +--:-:-:-:0 IADD R88, R88, 0x1; +--:-:-:-:1 LDS R65, [R92+0x80]; +--:-:-:-:0 IADD R76, R76, R77.reuse; +--:-:-:-:1 LDS R66, [R92+0x100]; +--:-:-:-:0 IADD R75, R75, R77.reuse; +--:-:-:-:1 LDS R67, [R92+0x180]; +--:-:-:-:0 IADD R78, R78, R77.reuse; +--:-:-:-:1 LDS R68, [R92+0x200]; +--:-:-:-:0 IADD R79, R79, R77; +--:-:-:-:1 LDS R69, [R92+0x280]; +--:-:-:-:1 ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; +--:-:-:-:1 LDS R70, [R92+0x300]; +--:-:-:-:1 ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6; +--:-:1:-:1 LDS R71, [R92+0x380]; +--:-:-:-:2 ISETP.LT.AND P2, PT, R86.reuse, c[0x0][0x148], P5; +--:-:-:Y:7 ISETP.LT.AND P3, PT, R86, c[0x0][0x148], P6; +01:-:-:-:1 @P0 STG.CG [R76], R64; +--:-:-:-:1 ISETP.LT.AND P0, PT, 
R87.reuse, c[0x0][0x148], P5; +--:-:-:-:1 @P1 STG.CG [R76+0x80], R65; +--:-:-:-:1 ISETP.LT.AND P1, PT, R87, c[0x0][0x148], P6; +--:-:-:-:1 @P2 STG.CG [R75], R66; +--:-:-:-:1 ISETP.LT.AND P2, PT, R88.reuse, c[0x0][0x148], P5; +--:-:-:-:1 @P3 STG.CG [R75+0x80], R67; +--:-:-:Y:7 ISETP.LT.AND P3, PT, R88, c[0x0][0x148], P6; +--:-:-:-:2 @P0 STG.CG [R78], R68; +--:-:-:-:2 @P1 STG.CG [R78+0x80], R69; +--:-:-:-:2 @P2 STG.CG [R79], R70; +--:2:-:-:1 @P3 STG.CG [R79+0x80], R71; +--:-:-:-:5 RET; diff --git a/Assembler/MaxAs/sgemm/sgemm_pre_128.sass b/Assembler/MaxAs/sgemm/sgemm_pre_128.sass new file mode 100644 index 0000000..cde320e --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm_pre_128.sass @@ -0,0 +1,924 @@ +# Kernel: sgemm_kernel_128 +# +# SharedSize: 16384 +# Params(8): +# 0:0x140:4:4 param_C, +# 1:0x144:4:0 param_m, +# 2:0x148:4:0 param_n, +# 3:0x14c:4:0 param_k, +# 4:0x150:4:0 param_lda, +# 5:0x154:4:0 param_ldb, +# 6:0x158:4:0 param_ldc +# 7:0x15c:4:0 param_alpha +# 8:0x160:4:4 param_D // for diagnostic printf output +# +# Globals: +# c[0x0][0x164]: texA (the value is 1) +# c[0x0][0x168]: texB (the value is 0) + + + + // Temporary registers to calculate the state registers. Reuse the C output registers. + // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts. + 0-63 ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy + + // Aliases for the C registers we use for initializing C (used as vectors) + 0-63 : cz<00-63> + + // The offset we store our zero value for initializing C. Reuse a register from the second blocking registers + 80 : zOffset + + // 64 C matrix output registers. + // Use special mapping to avoid register bank conflicts between these registers and the blocking registers. 
+ 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> + 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> + 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> + 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> + 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> + + // Double buffered register blocking used in vector loads. + // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags + 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> + 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> + + // Registers to load A or B + 96-103 : loadX<0-7> + + // Key global state registers for main loop and some we reuse for outputting C. + // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of + // delayed bank conflicts between memory operations and ffmas. + // The array index bracket notation can be used to request a bank in a dynamically allocated range. + 104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs + + // Registers to store the results back to global memory. Reuse any register not needed after the main loop. + // Statically allocate cs0-7 because they're vector registers. + 64-71 : cs<0-7> + + // dynamically allocated C output registers(~) + 72-103 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX + + + +// Note the absence of the loading of the stack pointer into R1. +// No idea why ptxas does that anyway when it's not used for register spilling. +// Such a waste of a perfectly good register.
+ +// Scheduler doesn't handle the dependency flags yet, +// so move these first instructions outside the block that's auto scheduled +//--:-:-:-:1 CS2R clock, SR_CLOCKLO; +//--:-:-:-:1 S2R smId, SR_VIRTID; +//--:-:-:-:1 S2R nSMs, SR_VIRTCFG; +--:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 +--:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 +--:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 + +// Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies +// Memory dependencies are left up to the author to deal with manually for now. +01:-:-:Y:1 ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 BFE.U32 tid4, tid, 0x205; // 2 bits at position 5 +--:-:-:-:1 MOV k, c[0x0][0x14c]; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 LOP.AND readBs, tid, 0x70; +--:-:-:-:1 SHL tid31_4, tid31, 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 IADD k, k, -8; +--:-:-:-:1 LOP.AND zOffset, tid, -32; +--:-:-:-:1 SHR.U32 readAs, tid128, 4; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:0 @!P0 MOV ldx4, c[0x0][0x150]; +--:-:-:-:1 STS.128 [zOffset + 4x<16*128>], RZ; +--:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; +--:-:-:-:1 ISCADD writeS, tid4, tid31_4, 9; +06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 +--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA +--:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB +--:-:-:-:1 LOP.OR readAs, readAs, tid7; +--:-:-:-:1 SHR.U32 ldx, ldx4, 2; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD track0, blk, tid31, 5; +--:-:-:-:1 IADD ldx8, ldx4, ldx4; +--:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*128>; +--:-:-:-:1 SHL readAs, readAs, 4; +--:-:-:-:1 XMAD.MRG xmad_t0, ldx, tid4.H1, RZ; // XMAD.LO is a macro that is expanded out into the 3 XMADs +--:-:-:-:1 ISCADD readBs, readBs, 4x<8*128>, 4; +--:-:-:-:1 XMAD track0, ldx, tid4, track0; +--:-:-:Y:5 XMAD.MRG xmad_end, k, ldx.H1,
RZ; +--:-:-:-:2 XMAD.PSL.CBCC track0, ldx.H1, xmad_t0.H1, track0; +--:-:1:-:4 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 +--:-:-:-:1 IADD track4, track0, ldx4; +--:-:-:-:1 XMAD end, k, ldx, track0; +--:-:2:Y:5 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:-:-:1 XMAD.PSL.CBCC end, k.H1, xmad_end.H1, end; + +// Initialize C registers to zero +// Using LDS.U.128 is a neat trick to save a few clock cycles +// (when you have enough warps to hide the latency.) +--:-:3:-:1 LDS.U.128 cz00, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz04, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz08, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz12, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz16, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz20, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz24, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz28, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz32, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz36, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz40, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz44, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz48, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz52, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz56, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz60, [zOffset + 4x<16*128>]; + +// These instructions need to occur after the textures load so put them in a new block +// that starts with a dependency barrier wait. +01:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1 +--:-:-:-:0 IADD track0, track0, ldx8; +02:-:-:-:1 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2 +--:-:-:-:0 IADD track4, track4, ldx8; +04:-:-:-:5 BAR.SYNC 0; + +// The next store to shared goes to high area. +// Having 2 shared buffers allows us to eliminate a bar.sync in the main loop. +// This way we don't have to wait for all threads to arrive before writing fresh data to shared.
+// Other threads can continue reading from the last batch while the new data is being written. +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*128>; + +// Preload the first lines of A and B from shared +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1 + + +// The main loop +// While calculating the first line, load in the next line from shared. +// Shared memory stores enough to do this 8 times per loop. +// Also pull in the next block of memory from global and store it to shared. + +// Efficiency: +// ffma: 512 +// lds: 32 dual issued +// sts: 2 dual issued +// tex: 2 dual issued +// add: 2 +// xor: 3 +// setp: 1 +// bar: 1 dual issued +// bra: 1 dual issued +// Total: 524 (512/518 = 98.8% FFMA) + +// Memory Throughput Upper Bound: +// 2 * 4 * 4 bytes per thread per 518 clocks +// 128 threads per SM +// 16 SM's (GM204) +// 1640Mhz (boost overclock) +// .931 GiB/GB (1000^3 / 1024^3) +// 193 GiB/sec +// Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz) + +LOOP: + +// Loop end condition +--:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; + +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<1*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<1*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax64, [readAs + 4x<1*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By64, [readBs + 4x<1*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j0Ax66, j0By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j0Ax66,
j0By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j0Ax64, j0By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j0Ax64, j0By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j0Ax67, j0By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j0Ax67, j0By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j0Ax65, j0By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j0Ax65, j0By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j0Ax67, j0By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j0Ax67, j0By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j0Ax65, j0By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j0Ax65, j0By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j0Ax66, j0By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j0Ax66, j0By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j0Ax64, j0By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j0Ax64, j0By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:0 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:2:-:1 @P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:-:Y:1 FFMA cx02y64, j0Ax02, j0By64, cx02y64; +--:-:-:-:0 FFMA cx02y65, j0Ax02, j0By65, cx02y65; +--:-:3:-:1 @P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3 +--:-:-:-:1 FFMA cx00y65, j0Ax00, j0By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j0Ax00, j0By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j0Ax03, j0By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j0Ax03, j0By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j0Ax01, j0By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j0Ax01, j0By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j0Ax66, j0By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j0Ax66, j0By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j0Ax64, j0By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j0Ax64, j0By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j0Ax67, j0By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j0Ax67, j0By65, 
cx67y65; +--:-:-:-:1 FFMA cx65y65, j0Ax65, j0By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j0Ax65, j0By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j0Ax67, j0By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j0Ax67, j0By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j0Ax65, j0By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j0Ax65, j0By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j0Ax66, j0By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j0Ax66, j0By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j0Ax64, j0By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j0Ax64, j0By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j0Ax03, j0By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j0Ax03, j0By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j0Ax01, j0By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j0Ax01, j0By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j0Ax02, j0By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j0Ax02, j0By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j0Ax00, j0By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j0Ax00, j0By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<2*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<2*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<2*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<2*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j1Ax66, j1By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j1Ax66, j1By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j1Ax64, j1By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j1Ax64, j1By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j1Ax67, j1By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j1Ax67, j1By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j1Ax65, j1By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j1Ax65, j1By00, cx65y00; 
+--:-:-:-:1 FFMA cx67y02, j1Ax67, j1By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j1Ax67, j1By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j1Ax65, j1By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j1Ax65, j1By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j1Ax66, j1By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j1Ax66, j1By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j1Ax64, j1By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j1Ax64, j1By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j1Ax02, j1By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j1Ax02, j1By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j1Ax00, j1By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j1Ax00, j1By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j1Ax03, j1By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j1Ax03, j1By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j1Ax01, j1By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j1Ax01, j1By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j1Ax66, j1By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j1Ax66, j1By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j1Ax64, j1By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j1Ax64, j1By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j1Ax67, j1By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j1Ax67, j1By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j1Ax65, j1By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j1Ax65, j1By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j1Ax67, j1By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j1Ax67, j1By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j1Ax65, j1By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j1Ax65, j1By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j1Ax66, j1By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j1Ax66, j1By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j1Ax64, j1By67, cx64y67; 
+--:-:-:-:1 FFMA cx64y66, j1Ax64, j1By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j1Ax03, j1By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j1Ax03, j1By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j1Ax01, j1By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j1Ax01, j1By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j1Ax02, j1By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j1Ax02, j1By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j1Ax00, j1By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j1Ax00, j1By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<3*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<3*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax64, [readAs + 4x<3*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By64, [readBs + 4x<3*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j0Ax66, j0By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j0Ax66, j0By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j0Ax64, j0By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j0Ax64, j0By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j0Ax67, j0By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j0Ax67, j0By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j0Ax65, j0By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j0Ax65, j0By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j0Ax67, j0By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j0Ax67, j0By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j0Ax65, j0By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j0Ax65, j0By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j0Ax66, j0By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j0Ax66, j0By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j0Ax64, j0By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j0Ax64, j0By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; 
+--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j0Ax02, j0By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j0Ax02, j0By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j0Ax00, j0By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j0Ax00, j0By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j0Ax03, j0By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j0Ax03, j0By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j0Ax01, j0By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j0Ax01, j0By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j0Ax66, j0By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j0Ax66, j0By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j0Ax64, j0By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j0Ax64, j0By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j0Ax67, j0By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j0Ax67, j0By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j0Ax65, j0By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j0Ax65, j0By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j0Ax67, j0By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j0Ax67, j0By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j0Ax65, j0By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j0Ax65, j0By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j0Ax66, j0By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j0Ax66, j0By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j0Ax64, j0By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j0Ax64, j0By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j0Ax03, j0By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j0Ax03, j0By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j0Ax01, j0By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j0Ax01, j0By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j0Ax02, j0By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j0Ax02, j0By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j0Ax00, j0By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j0Ax00, j0By66, cx00y66; 
+01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<4*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<4*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<4*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<4*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j1Ax66, j1By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j1Ax66, j1By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j1Ax64, j1By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j1Ax64, j1By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j1Ax67, j1By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j1Ax67, j1By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j1Ax65, j1By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j1Ax65, j1By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j1Ax67, j1By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j1Ax67, j1By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j1Ax65, j1By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j1Ax65, j1By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j1Ax66, j1By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j1Ax66, j1By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j1Ax64, j1By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j1Ax64, j1By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j1Ax02, j1By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j1Ax02, j1By65, cx02y65; 
+--:-:-:-:1 FFMA cx00y65, j1Ax00, j1By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j1Ax00, j1By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j1Ax03, j1By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j1Ax03, j1By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j1Ax01, j1By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j1Ax01, j1By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j1Ax66, j1By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j1Ax66, j1By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j1Ax64, j1By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j1Ax64, j1By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j1Ax67, j1By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j1Ax67, j1By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j1Ax65, j1By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j1Ax65, j1By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j1Ax67, j1By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j1Ax67, j1By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j1Ax65, j1By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j1Ax65, j1By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j1Ax66, j1By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j1Ax66, j1By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j1Ax64, j1By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j1Ax64, j1By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j1Ax03, j1By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j1Ax03, j1By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j1Ax01, j1By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j1Ax01, j1By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j1Ax02, j1By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j1Ax02, j1By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j1Ax00, j1By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j1Ax00, j1By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<5*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<5*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax64, [readAs + 4x<5*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, 
j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By64, [readBs + 4x<5*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j0Ax66, j0By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j0Ax66, j0By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j0Ax64, j0By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j0Ax64, j0By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j0Ax67, j0By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j0Ax67, j0By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j0Ax65, j0By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j0Ax65, j0By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j0Ax67, j0By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j0Ax67, j0By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j0Ax65, j0By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j0Ax65, j0By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j0Ax66, j0By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j0Ax66, j0By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j0Ax64, j0By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j0Ax64, j0By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j0Ax02, j0By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j0Ax02, j0By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j0Ax00, j0By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j0Ax00, j0By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j0Ax03, j0By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j0Ax03, j0By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j0Ax01, j0By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j0Ax01, j0By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j0Ax66, j0By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j0Ax66, j0By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j0Ax64, j0By65, cx64y65; +--:-:-:-:1 FFMA 
cx64y64, j0Ax64, j0By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j0Ax67, j0By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j0Ax67, j0By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j0Ax65, j0By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j0Ax65, j0By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j0Ax67, j0By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j0Ax67, j0By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j0Ax65, j0By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j0Ax65, j0By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j0Ax66, j0By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j0Ax66, j0By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j0Ax64, j0By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j0Ax64, j0By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j0Ax03, j0By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j0Ax03, j0By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j0Ax01, j0By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j0Ax01, j0By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j0Ax02, j0By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j0Ax02, j0By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j0Ax00, j0By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j0Ax00, j0By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<6*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<6*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<6*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<6*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j1Ax66, j1By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j1Ax66, j1By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j1Ax64, j1By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j1Ax64, j1By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j1Ax67, j1By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, 
j1Ax67, j1By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j1Ax65, j1By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j1Ax65, j1By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j1Ax67, j1By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j1Ax67, j1By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j1Ax65, j1By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j1Ax65, j1By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j1Ax66, j1By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j1Ax66, j1By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j1Ax64, j1By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j1Ax64, j1By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j1Ax02, j1By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j1Ax02, j1By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j1Ax00, j1By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j1Ax00, j1By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j1Ax03, j1By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j1Ax03, j1By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j1Ax01, j1By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j1Ax01, j1By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j1Ax66, j1By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j1Ax66, j1By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j1Ax64, j1By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j1Ax64, j1By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j1Ax67, j1By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j1Ax67, j1By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j1Ax65, j1By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j1Ax65, j1By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j1Ax67, j1By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j1Ax67, j1By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j1Ax65, j1By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j1Ax65, j1By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j1Ax66, 
j1By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j1Ax66, j1By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j1Ax64, j1By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j1Ax64, j1By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j1Ax03, j1By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j1Ax03, j1By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j1Ax01, j1By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j1Ax01, j1By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j1Ax02, j1By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j1Ax02, j1By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j1Ax00, j1By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j1Ax00, j1By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<7*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<7*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax64, [readAs + 4x<7*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By64, [readBs + 4x<7*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j0Ax66, j0By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j0Ax66, j0By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j0Ax64, j0By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j0Ax64, j0By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j0Ax67, j0By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j0Ax67, j0By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j0Ax65, j0By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j0Ax65, j0By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j0Ax67, j0By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j0Ax67, j0By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j0Ax65, j0By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j0Ax65, j0By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j0Ax66, j0By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j0Ax66, j0By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j0Ax64, j0By03, 
cx64y03; +--:-:-:-:1 FFMA cx64y02, j0Ax64, j0By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:0 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +02:-:-:-:1 @P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2 +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j0Ax02, j0By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j0Ax02, j0By65, cx02y65; +--:-:-:-:0 FFMA cx00y65, j0Ax00, j0By65, cx00y65; +04:-:-:-:1 @P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3 +--:-:-:-:1 FFMA cx00y64, j0Ax00, j0By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j0Ax03, j0By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j0Ax03, j0By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j0Ax01, j0By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j0Ax01, j0By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j0Ax66, j0By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j0Ax66, j0By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j0Ax64, j0By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j0Ax64, j0By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j0Ax67, j0By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j0Ax67, j0By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j0Ax65, j0By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j0Ax65, j0By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j0Ax67, j0By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j0Ax67, j0By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j0Ax65, j0By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j0Ax65, j0By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j0Ax66, j0By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j0Ax66, j0By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j0Ax64, j0By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j0Ax64, j0By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j0Ax03, j0By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j0Ax03, j0By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j0Ax01, j0By67, cx01y67; +--:-:-:-:1 FFMA 
cx01y66, j0Ax01, j0By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j0Ax02, j0By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j0Ax02, j0By67, cx02y67; +--:-:-:-:0 FFMA cx00y67, j0Ax00, j0By67, cx00y67; +01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1 +--:-:-:-:1 @P0 LOP.XOR readAs, readAs, 4x<16*128>; +--:-:-:-:1 @P0 LOP.XOR readBs, readBs, 4x<16*128>; +--:-:-:-:1 @P0 LOP.XOR writeS, writeS, 4x<16*128>; +--:-:-:-:1 FFMA cx00y66, j0Ax00, j0By66, cx00y66; +--:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; +--:-:-:-:1 @P0 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 @P0 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 @P0 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 @P0 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j1Ax66, j1By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j1Ax66, j1By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j1Ax64, j1By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j1Ax64, j1By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j1Ax67, j1By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j1Ax67, j1By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j1Ax65, j1By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j1Ax65, j1By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j1Ax67, j1By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j1Ax67, j1By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j1Ax65, j1By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j1Ax65, j1By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j1Ax66, j1By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j1Ax66, j1By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j1Ax64, j1By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j1Ax64, j1By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, 
j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j1Ax02, j1By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j1Ax02, j1By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j1Ax00, j1By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j1Ax00, j1By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j1Ax03, j1By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j1Ax03, j1By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j1Ax01, j1By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j1Ax01, j1By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j1Ax66, j1By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j1Ax66, j1By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j1Ax64, j1By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j1Ax64, j1By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j1Ax67, j1By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j1Ax67, j1By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j1Ax65, j1By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j1Ax65, j1By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j1Ax67, j1By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j1Ax67, j1By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j1Ax65, j1By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j1Ax65, j1By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j1Ax66, j1By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j1Ax66, j1By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j1Ax64, j1By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j1Ax64, j1By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j1Ax03, j1By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j1Ax03, j1By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j1Ax01, j1By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j1Ax01, j1By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j1Ax02, j1By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j1Ax02, j1By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j1Ax00, j1By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j1Ax00, j1By66, cx00y66; +--:-:-:-:1 @P0 IADD track0, track0, ldx8; 
+--:-:-:-:0 @P0 IADD track4, track4, ldx8; +--:-:-:Y:5 @P0 BRA LOOP; + +// Main loop is done, time to write C to global memory. +--:-:-:-:1 SHR.U32 cx, tid128, 2; +--:-:-:-:1 MOV ldc, c[0x0][0x158]; +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 MOV alpha, c[0x0][0x15c]; +--:-:-:-:1 SHL readCs, tid96, 4; +--:-:-:-:1 LOP.AND readAs, readAs, 0xfff; +--:-:-:-:1 LOP.OR cx, tid31, cx; +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 LOP.AND readBs, readBs, 0xfff; +--:-:-:-:1 ISCADD cy00, by, cy00, 7; +--:-:-:-:1 FMUL cs0, cx00y00, alpha; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 LOP.OR readCs, readCs, cx; +--:-:-:-:1 ISCADD cx, bx, cx, 7; +--:-:-:-:1 FMUL cs1, cx01y00, alpha; +--:-:-:-:1 SHL ldc8, ldc, 5; +--:-:-:-:1 XMAD.MRG xmad_ci, cy00, ldc.H1, RZ; +--:-:-:-:1 ISCADD writeCs, readBs, readAs, 5; +--:-:-:-:1 FMUL cs2, cx02y00, alpha; +--:-:-:-:1 SHL readCs, readCs, 2; +--:-:-:-:1 XMAD ci, cy00, ldc, cx; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m +--:-:-:-:1 IADD cx, cx, 64; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; +--:-:-:-:1 FMUL cs3, cx03y00, alpha; +--:-:-:-:1 FMUL cs4, cx64y00, alpha; +--:-:-:-:1 XMAD.PSL.CBCC ci, cy00.H1, xmad_ci.H1, ci; +--:-:-:-:1 IADD cy00, cy00, -1; +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m +--:-:-:-:1 FMUL cs5, cx65y00, alpha; +--:-:-:-:1 FMUL cs6, cx66y00, alpha; +--:-:-:-:1 FMUL cs7, cx67y00, alpha; +--:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:3 IADD cy12, cy00, 12; +--:-:-:Y:6 IADD Cy00, Cy00, -ldc1; +--:-:-:-:1 IADD Cy04, Cy00, ldc4; +--:-:-:Y:5 IADD Cy08, Cy00, ldc8; +--:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) + +// There's nothing yet in place to handle dependencies with subroutines. +// So don't schedule this block. 
+--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y01, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y01, alpha; +--:-:-:-:1 FMUL cs2, cx02y01, alpha; +--:-:-:-:1 FMUL cs3, cx03y01, alpha; +--:-:-:-:1 FMUL cs4, cx64y01, alpha; +--:-:-:-:1 FMUL cs5, cx65y01, alpha; +--:-:-:-:1 FMUL cs6, cx66y01, alpha; +--:-:-:-:0 FMUL cs7, cx67y01, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y02, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y02, alpha; +--:-:-:-:1 FMUL cs2, cx02y02, alpha; +--:-:-:-:1 FMUL cs3, cx03y02, alpha; +--:-:-:-:1 FMUL cs4, cx64y02, alpha; +--:-:-:-:1 FMUL cs5, cx65y02, alpha; +--:-:-:-:1 FMUL cs6, cx66y02, alpha; +--:-:-:-:0 FMUL cs7, cx67y02, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y03, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y03, alpha; +--:-:-:-:1 FMUL cs2, cx02y03, alpha; +--:-:-:-:1 FMUL cs3, cx03y03, alpha; +--:-:-:-:1 FMUL cs4, cx64y03, alpha; +--:-:-:-:1 FMUL cs5, cx65y03, alpha; +--:-:-:-:1 FMUL cs6, cx66y03, alpha; +--:-:-:-:0 FMUL cs7, cx67y03, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 IADD cy00, cy00, 60; +--:-:-:-:1 IADD cy04, cy04, 60; +--:-:-:-:1 IADD cy08, cy08, 60; +--:-:-:-:1 IADD cy12, cy12, 60; + +02:-:-:-:1 IADD Cy00, Cy00, ldc60; // Wait Dep 2 +--:-:-:-:1 IADD Cy04, Cy04, ldc60; +--:-:-:-:1 IADD Cy08, Cy08, ldc60; +--:-:-:-:1 IADD Cy12, Cy12, ldc60; + +--:-:-:-:1 FMUL cs0, cx00y64, alpha; +--:-:-:-:1 FMUL cs1, cx01y64, alpha; +--:-:-:-:1 FMUL cs2, cx02y64, alpha; +--:-:-:-:1 FMUL cs3, cx03y64, alpha; +--:-:-:-:1 FMUL cs4, cx64y64, alpha; +--:-:-:-:1 FMUL cs5, cx65y64, alpha; +--:-:-:-:1 FMUL cs6, cx66y64, alpha; +--:-:-:-:0 FMUL cs7, cx67y64, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y65, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y65, alpha; +--:-:-:-:1 FMUL cs2, cx02y65, alpha; +--:-:-:-:1 FMUL cs3, cx03y65, alpha; +--:-:-:-:1 FMUL cs4, cx64y65, alpha; +--:-:-:-:1 FMUL cs5, cx65y65, alpha; +--:-:-:-:1 
FMUL cs6, cx66y65, alpha; +--:-:-:-:0 FMUL cs7, cx67y65, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y66, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y66, alpha; +--:-:-:-:1 FMUL cs2, cx02y66, alpha; +--:-:-:-:1 FMUL cs3, cx03y66, alpha; +--:-:-:-:1 FMUL cs4, cx64y66, alpha; +--:-:-:-:1 FMUL cs5, cx65y66, alpha; +--:-:-:-:1 FMUL cs6, cx66y66, alpha; +--:-:-:-:0 FMUL cs7, cx67y66, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y67, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y67, alpha; +--:-:-:-:1 FMUL cs2, cx02y67, alpha; +--:-:-:-:1 FMUL cs3, cx03y67, alpha; +--:-:-:-:1 FMUL cs4, cx64y67, alpha; +--:-:-:-:1 FMUL cs5, cx65y67, alpha; +--:-:-:-:1 FMUL cs6, cx66y67, alpha; +--:-:-:-:0 FMUL cs7, cx67y67, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + + +// And we're done. The remainder is the STORE_C subroutine that's defined at the end of the kernel. +--:-:-:-:5 EXIT; + +// This routine does warp synchronous shuffling of our output data so as to be able +// to have coalesced writes to global memory. This is actually faster because the shared +// memory latencies can be hidden by other warps and we're only adding a few extra clocks +// to this thread. Global memory here is the bottleneck and being able to halve the needed +// bandwidth at the expense of a few clocks is a modest win. This also keeps power lower +// and our chip running faster. + +// Note, the SHFL instruction doesn't help us here because we're swapping different registers +// from different threads. 
+STORE_C: + +--:-:-:-:0 IADD cy00, cy00, 1; // cy00 += 1; cy00 is tested against c[0x0][0x148] (n) below +--:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; // 128-bit shared-memory store starting at cs0 +--:-:-:-:0 IADD cy04, cy04, 1; +--:-:-:-:1 STS.128 [writeCs+4x<64>], cs4; // 128-bit shared-memory store starting at cs4 +--:-:-:-:0 IADD cy08, cy08, 1; +--:-:-:-:1 LDS cs0, [readCs + 4x<0*128 + 00>]; // read the staged values back through readCs, one LDS per cs register +--:-:-:-:0 IADD cy12, cy12, 1; +--:-:-:-:1 LDS cs1, [readCs + 4x<0*128 + 64>]; +--:-:-:-:0 IADD Cy00, Cy00, ldc1; // Cy00 += ldc1; Cy00 is the address used by the STG stores below +--:-:-:-:1 LDS cs2, [readCs + 4x<1*128 + 00>]; +--:-:-:-:0 IADD Cy04, Cy04, ldc1; +--:-:-:-:1 LDS cs3, [readCs + 4x<1*128 + 64>]; +--:-:-:-:0 IADD Cy08, Cy08, ldc1; +--:-:-:-:1 LDS cs4, [readCs + 4x<2*128 + 00>]; +--:-:-:-:0 IADD Cy12, Cy12, ldc1; +--:-:-:-:1 LDS cs5, [readCs + 4x<2*128 + 64>]; +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m +--:-:-:-:1 LDS cs6, [readCs + 4x<3*128 + 00>]; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m +--:-:1:-:1 LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1 +--:-:-:-:2 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m +--:-:-:Y:7 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m +01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m +--:-:-:-:1 @P1 STG.CG [Cy00 + 4x<64>], cs1; // store only where the P1 bounds check passed +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m +--:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m +--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<64>], cs3; +--:-:-:Y:7 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m +--:-:-:-:2 @P0 STG.CG [Cy08 + 4x<00>], cs4; +--:-:-:-:2 @P1 STG.CG [Cy08 + 4x<64>], cs5; +--:-:-:-:2 @P2 STG.CG [Cy12 + 4x<00>], cs6; +--:2:-:-:1 @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2 + +--:-:-:-:5 RET; // return to the caller (invoked via CAL STORE_C) + diff --git a/Assembler/MaxAs/sgemm/sgemm_pre_64.sass b/Assembler/MaxAs/sgemm/sgemm_pre_64.sass new file mode 100644 index 0000000..aa2719e --- /dev/null 
+++ b/Assembler/MaxAs/sgemm/sgemm_pre_64.sass @@ -0,0 +1,867 @@ +# Kernel: sgemm_kernel_64 +# +# SharedSize: 8192 +# Params(8): +# 0:0x140:4:4 param_C, +# 1:0x144:4:0 param_m, +# 2:0x148:4:0 param_n, +# 3:0x14c:4:0 param_k, +# 4:0x150:4:0 param_lda, +# 5:0x154:4:0 param_ldb, +# 6:0x158:4:0 param_ldc +# 7:0x15c:4:0 param_alpha +# 8:0x160:4:4 param_D // for diagnostic printf output +# +# Globals: +# c[0x0][0x164]: texA (the value is 1) +# c[0x0][0x168]: texB (the value is 0) + + + + 0-63 ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end + + 80 : zOffset + 0-63 : cz<00-63> + + 3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35> + 35,34,43,42,51,50,59,58 : cx32y<00-03|32-35> + 39,38,47,46,55,54,63,62 : cx33y<00-03|32-35> + 33,32,41,40,49,48,57,56 : cx34y<00-03|32-35> + 37,36,45,44,53,52,61,60 : cx35y<00-03|32-35> + + 64-79 : j0Ax<00-03|32-35>, j0By<00-03|32-35> + 80-95 : j1Ax<00-03|32-35>, j1By<00-03|32-35> + + 64-71 : cs<0-7> + + 96-111 : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3> + + 112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32 + + 72-111 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX + + + +--:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 +--:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 +--:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 BFE.U32 tid2, tid, 0x104; // 1 bit at position 4 +--:-:-:-:1 MOV k, c[0x0][0x14c]; +--:-:-:-:1 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.AND readBs, tid, 0x30; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tid15_4, tid15, 4; +--:-:-:-:1 LOP.AND zOffset, tid, -32; +--:-:-:-:1 IADD k, k, -8; +--:-:-:-:1 SHL 
readAs, readAs, 4; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:0 @!P0 MOV ldx4, c[0x0][0x150]; +--:-:-:-:1 STS.128 [zOffset + 4x<16*64>], RZ; +--:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; +--:-:-:-:1 ISCADD writeS, tid2, tid15_4, 8; +06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 +--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA +--:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 SHR.U32 ldx, ldx4, 2; +--:-:-:-:1 LOP.AND tid32, tid, 32; +--:-:-:-:1 ISCADD track0, blk, tid15, 4; +--:-:-:-:1 IADD ldx8, ldx4, ldx4; +--:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*64>; +--:-:-:-:1 ISCADD readBs, readBs, 4x<8*64>, 4; +--:-:-:-:1 XMAD.MRG xmad_t0, ldx, tid2.H1, RZ; +--:-:-:-:1 XMAD.MRG xmad_end, k, ldx.H1, RZ; +--:-:-:Y:6 XMAD track0, ldx, tid2, track0; +--:-:-:-:2 XMAD.PSL.CBCC track0, ldx.H1, xmad_t0.H1, track0; +--:-:1:-:4 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 +--:-:-:-:1 IADD3 track2, track0, ldx, ldx; +--:-:-:-:1 IADD track4, track0, ldx4; +--:-:2:-:1 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:-:-:0 XMAD end, k, ldx, track0; +--:-:3:-:3 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3 +--:-:-:-:2 IADD track6, track2, ldx4; +--:-:4:-:1 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4 +--:-:-:-:1 XMAD.PSL.CBCC end, k.H1, xmad_end.H1, end; + +--:-:5:-:1 LDS.U.128 cz00, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz04, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz08, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz12, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz16, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz20, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz24, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz28, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz32, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz36, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz40, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz44, [zOffset 
+ 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz48, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz52, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz56, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz60, [zOffset + 4x<16*64>]; + +01:-:-:-:1 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1 +--:-:-:-:0 IADD track0, track0, ldx8; +02:-:-:-:1 STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2 +--:-:-:-:0 IADD track2, track2, ldx8; +04:-:-:-:1 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3 +--:-:-:-:0 IADD track4, track4, ldx8; +08:-:-:-:1 STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4 +--:-:-:-:0 IADD track6, track6, ldx8; +10:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*64>; + +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1 + +// Efficiency: +// ffma: 512 +// lds: 32 dual issued +// sts: 4 dual issued +// tex: 4 dual issued +// add: 4 +// xor: 3 +// setp: 1 +// bar: 1 dual issued +// bra: 1 dual issued +// Total: 520 (512/520 = 98.5% FFMA) + +LOOP: + +// Loop end condition +--:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; + +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<1*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<1*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax32, [readAs + 4x<1*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By32, [readBs + 4x<1*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j0Ax34, j0By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j0Ax34, j0By01, 
cx34y01; +--:-:-:-:1 FFMA cx32y01, j0Ax32, j0By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j0Ax32, j0By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j0Ax35, j0By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j0Ax35, j0By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j0Ax33, j0By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j0Ax33, j0By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j0Ax35, j0By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j0Ax35, j0By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j0Ax33, j0By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j0Ax33, j0By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j0Ax34, j0By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j0Ax34, j0By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j0Ax32, j0By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j0Ax32, j0By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:0 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:-:1 @P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA cx02y32, j0Ax02, j0By32, cx02y32; +--:-:-:-:0 FFMA cx02y33, j0Ax02, j0By33, cx02y33; +--:-:2:-:1 @P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:-:-:1 FFMA cx00y33, j0Ax00, j0By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j0Ax00, j0By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j0Ax03, j0By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j0Ax03, j0By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j0Ax01, j0By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j0Ax01, j0By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j0Ax34, j0By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j0Ax34, j0By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j0Ax32, j0By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j0Ax32, j0By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j0Ax35, j0By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j0Ax35, j0By33, cx35y33; +--:-:-:-:1 FFMA 
cx33y33, j0Ax33, j0By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j0Ax33, j0By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j0Ax35, j0By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j0Ax35, j0By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j0Ax33, j0By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j0Ax33, j0By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j0Ax34, j0By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j0Ax34, j0By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j0Ax32, j0By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j0Ax32, j0By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j0Ax03, j0By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j0Ax03, j0By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j0Ax01, j0By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j0Ax01, j0By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j0Ax02, j0By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j0Ax02, j0By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j0Ax00, j0By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j0Ax00, j0By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<2*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<2*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<2*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<2*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j1Ax34, j1By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j1Ax34, j1By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j1Ax32, j1By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j1Ax32, j1By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j1Ax35, j1By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j1Ax35, j1By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j1Ax33, j1By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j1Ax33, j1By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, 
j1Ax35, j1By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j1Ax35, j1By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j1Ax33, j1By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j1Ax33, j1By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j1Ax34, j1By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j1Ax34, j1By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j1Ax32, j1By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j1Ax32, j1By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:0 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:-:1 @P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA cx02y32, j1Ax02, j1By32, cx02y32; +--:-:-:-:0 FFMA cx02y33, j1Ax02, j1By33, cx02y33; +--:-:3:-:1 @P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3 +--:-:-:-:1 FFMA cx00y33, j1Ax00, j1By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j1Ax00, j1By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j1Ax03, j1By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j1Ax03, j1By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j1Ax01, j1By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j1Ax01, j1By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j1Ax34, j1By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j1Ax34, j1By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j1Ax32, j1By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j1Ax32, j1By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j1Ax35, j1By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j1Ax35, j1By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j1Ax33, j1By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j1Ax33, j1By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j1Ax35, j1By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j1Ax35, j1By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j1Ax33, j1By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j1Ax33, j1By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j1Ax34, j1By34, cx34y34; 
+--:-:-:-:1 FFMA cx34y35, j1Ax34, j1By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j1Ax32, j1By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j1Ax32, j1By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j1Ax03, j1By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j1Ax03, j1By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j1Ax01, j1By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j1Ax01, j1By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j1Ax02, j1By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j1Ax02, j1By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j1Ax00, j1By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j1Ax00, j1By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<3*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<3*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax32, [readAs + 4x<3*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By32, [readBs + 4x<3*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j0Ax34, j0By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j0Ax34, j0By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j0Ax32, j0By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j0Ax32, j0By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j0Ax35, j0By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j0Ax35, j0By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j0Ax33, j0By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j0Ax33, j0By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j0Ax35, j0By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j0Ax35, j0By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j0Ax33, j0By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j0Ax33, j0By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j0Ax34, j0By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j0Ax34, j0By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j0Ax32, j0By03, cx32y03; +--:-:-:-:1 
FFMA cx32y02, j0Ax32, j0By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j0Ax02, j0By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j0Ax02, j0By33, cx02y33; +--:-:-:-:1 FFMA cx00y33, j0Ax00, j0By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j0Ax00, j0By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j0Ax03, j0By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j0Ax03, j0By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j0Ax01, j0By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j0Ax01, j0By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j0Ax34, j0By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j0Ax34, j0By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j0Ax32, j0By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j0Ax32, j0By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j0Ax35, j0By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j0Ax35, j0By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j0Ax33, j0By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j0Ax33, j0By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j0Ax35, j0By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j0Ax35, j0By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j0Ax33, j0By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j0Ax33, j0By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j0Ax34, j0By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j0Ax34, j0By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j0Ax32, j0By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j0Ax32, j0By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j0Ax03, j0By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j0Ax03, j0By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j0Ax01, j0By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j0Ax01, j0By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j0Ax02, j0By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j0Ax02, j0By35, cx02y35; +--:-:-:-:1 FFMA 
cx00y35, j0Ax00, j0By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j0Ax00, j0By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<4*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<4*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<4*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<4*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j1Ax34, j1By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j1Ax34, j1By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j1Ax32, j1By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j1Ax32, j1By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j1Ax35, j1By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j1Ax35, j1By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j1Ax33, j1By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j1Ax33, j1By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j1Ax35, j1By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j1Ax35, j1By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j1Ax33, j1By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j1Ax33, j1By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j1Ax34, j1By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j1Ax34, j1By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j1Ax32, j1By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j1Ax32, j1By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, 
j1Ax02, j1By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j1Ax02, j1By33, cx02y33; +--:-:-:-:1 FFMA cx00y33, j1Ax00, j1By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j1Ax00, j1By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j1Ax03, j1By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j1Ax03, j1By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j1Ax01, j1By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j1Ax01, j1By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j1Ax34, j1By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j1Ax34, j1By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j1Ax32, j1By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j1Ax32, j1By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j1Ax35, j1By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j1Ax35, j1By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j1Ax33, j1By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j1Ax33, j1By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j1Ax35, j1By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j1Ax35, j1By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j1Ax33, j1By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j1Ax33, j1By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j1Ax34, j1By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j1Ax34, j1By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j1Ax32, j1By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j1Ax32, j1By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j1Ax03, j1By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j1Ax03, j1By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j1Ax01, j1By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j1Ax01, j1By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j1Ax02, j1By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j1Ax02, j1By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j1Ax00, j1By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j1Ax00, j1By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<5*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<5*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 
LDS.U.128 j1Ax32, [readAs + 4x<5*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By32, [readBs + 4x<5*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j0Ax34, j0By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j0Ax34, j0By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j0Ax32, j0By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j0Ax32, j0By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j0Ax35, j0By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j0Ax35, j0By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j0Ax33, j0By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j0Ax33, j0By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j0Ax35, j0By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j0Ax35, j0By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j0Ax33, j0By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j0Ax33, j0By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j0Ax34, j0By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j0Ax34, j0By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j0Ax32, j0By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j0Ax32, j0By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j0Ax02, j0By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j0Ax02, j0By33, cx02y33; +--:-:-:-:1 FFMA cx00y33, j0Ax00, j0By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j0Ax00, j0By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j0Ax03, j0By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j0Ax03, j0By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j0Ax01, j0By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j0Ax01, j0By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j0Ax34, j0By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j0Ax34, j0By33, cx34y33; 
+--:-:-:-:1 FFMA cx32y33, j0Ax32, j0By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j0Ax32, j0By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j0Ax35, j0By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j0Ax35, j0By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j0Ax33, j0By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j0Ax33, j0By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j0Ax35, j0By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j0Ax35, j0By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j0Ax33, j0By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j0Ax33, j0By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j0Ax34, j0By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j0Ax34, j0By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j0Ax32, j0By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j0Ax32, j0By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j0Ax03, j0By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j0Ax03, j0By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j0Ax01, j0By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j0Ax01, j0By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j0Ax02, j0By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j0Ax02, j0By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j0Ax00, j0By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j0Ax00, j0By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<6*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<6*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<6*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<6*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j1Ax34, j1By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j1Ax34, j1By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j1Ax32, j1By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j1Ax32, j1By00, cx32y00; +--:-:-:-:1 
FFMA cx35y00, j1Ax35, j1By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j1Ax35, j1By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j1Ax33, j1By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j1Ax33, j1By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j1Ax35, j1By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j1Ax35, j1By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j1Ax33, j1By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j1Ax33, j1By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j1Ax34, j1By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j1Ax34, j1By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j1Ax32, j1By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j1Ax32, j1By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:0 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +02:-:-:-:1 @P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2 +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j1Ax02, j1By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j1Ax02, j1By33, cx02y33; +--:-:-:-:0 FFMA cx00y33, j1Ax00, j1By33, cx00y33; +--:-:-:-:1 @P0 STS.128 [writeS + 4x<2*64>], loadX2; +--:-:-:-:1 FFMA cx00y32, j1Ax00, j1By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j1Ax03, j1By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j1Ax03, j1By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j1Ax01, j1By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j1Ax01, j1By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j1Ax34, j1By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j1Ax34, j1By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j1Ax32, j1By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j1Ax32, j1By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j1Ax35, j1By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j1Ax35, j1By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j1Ax33, j1By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j1Ax33, j1By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j1Ax35, j1By34, cx35y34; 
+--:-:-:-:1 FFMA cx35y35, j1Ax35, j1By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j1Ax33, j1By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j1Ax33, j1By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j1Ax34, j1By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j1Ax34, j1By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j1Ax32, j1By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j1Ax32, j1By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j1Ax03, j1By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j1Ax03, j1By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j1Ax01, j1By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j1Ax01, j1By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j1Ax02, j1By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j1Ax02, j1By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j1Ax00, j1By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j1Ax00, j1By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<7*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<7*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax32, [readAs + 4x<7*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By32, [readBs + 4x<7*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j0Ax34, j0By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j0Ax34, j0By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j0Ax32, j0By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j0Ax32, j0By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j0Ax35, j0By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j0Ax35, j0By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j0Ax33, j0By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j0Ax33, j0By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j0Ax35, j0By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j0Ax35, j0By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j0Ax33, j0By03, cx33y03; +--:-:-:-:1 
FFMA cx33y02, j0Ax33, j0By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j0Ax34, j0By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j0Ax34, j0By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j0Ax32, j0By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j0Ax32, j0By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:0 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +04:-:-:-:1 @P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3 +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j0Ax02, j0By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j0Ax02, j0By33, cx02y33; +--:-:-:-:0 FFMA cx00y33, j0Ax00, j0By33, cx00y33; +--:-:-:-:1 @P0 STS.128 [writeS + 4x<6*64>], loadX6; +--:-:-:-:1 FFMA cx00y32, j0Ax00, j0By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j0Ax03, j0By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j0Ax03, j0By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j0Ax01, j0By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j0Ax01, j0By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j0Ax34, j0By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j0Ax34, j0By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j0Ax32, j0By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j0Ax32, j0By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j0Ax35, j0By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j0Ax35, j0By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j0Ax33, j0By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j0Ax33, j0By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j0Ax35, j0By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j0Ax35, j0By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j0Ax33, j0By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j0Ax33, j0By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j0Ax34, j0By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j0Ax34, j0By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j0Ax32, j0By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j0Ax32, j0By34, cx32y34; 
+--:-:-:-:1 FFMA cx03y34, j0Ax03, j0By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j0Ax03, j0By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j0Ax01, j0By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j0Ax01, j0By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j0Ax02, j0By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j0Ax02, j0By35, cx02y35; +--:-:-:-:0 FFMA cx00y35, j0Ax00, j0By35, cx00y35; +01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1 +--:-:-:-:1 @P0 LOP.XOR readAs, readAs, 4x<16*64>; +--:-:-:-:1 @P0 LOP.XOR readBs, readBs, 4x<16*64>; +--:-:-:-:1 @P0 LOP.XOR writeS, writeS, 4x<16*64>; +--:-:-:-:1 FFMA cx00y34, j0Ax00, j0By34, cx00y34; +--:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; +--:-:-:-:1 @P0 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 @P0 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 @P0 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 @P0 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j1Ax34, j1By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j1Ax34, j1By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j1Ax32, j1By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j1Ax32, j1By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j1Ax35, j1By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j1Ax35, j1By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j1Ax33, j1By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j1Ax33, j1By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j1Ax35, j1By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j1Ax35, j1By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j1Ax33, j1By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j1Ax33, j1By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j1Ax34, j1By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j1Ax34, j1By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, 
j1Ax32, j1By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j1Ax32, j1By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j1Ax02, j1By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j1Ax02, j1By33, cx02y33; +--:-:-:-:1 FFMA cx00y33, j1Ax00, j1By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j1Ax00, j1By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j1Ax03, j1By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j1Ax03, j1By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j1Ax01, j1By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j1Ax01, j1By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j1Ax34, j1By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j1Ax34, j1By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j1Ax32, j1By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j1Ax32, j1By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j1Ax35, j1By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j1Ax35, j1By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j1Ax33, j1By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j1Ax33, j1By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j1Ax35, j1By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j1Ax35, j1By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j1Ax33, j1By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j1Ax33, j1By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j1Ax34, j1By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j1Ax34, j1By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j1Ax32, j1By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j1Ax32, j1By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j1Ax03, j1By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j1Ax03, j1By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j1Ax01, j1By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j1Ax01, j1By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j1Ax02, j1By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j1Ax02, 
j1By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j1Ax00, j1By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j1Ax00, j1By34, cx00y34; +--:-:-:-:1 @P0 IADD track0, track0, ldx8; +--:-:-:-:1 @P0 IADD track2, track2, ldx8; +--:-:-:-:1 @P0 IADD track4, track4, ldx8; +--:-:-:-:0 @P0 IADD track6, track6, ldx8; +--:-:-:Y:5 @P0 BRA LOOP; + +--:-:-:-:1 SHR.U32 cy00, tid32, 1; +--:-:-:-:1 MOV ldc, c[0x0][0x158]; +--:-:-:-:1 ISCADD cx, bx, tid31, 6; +--:-:-:-:1 MOV alpha, c[0x0][0x15c]; +--:-:-:-:1 ISCADD readCs, tid32, tid31, 3; +--:-:-:-:1 LOP.AND readAs, readAs, 0x7ff; +--:-:-:-:1 ISCADD cy00, by, cy00, 6; +--:-:-:-:1 LOP.AND readBs, readBs, 0x7ff; +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 FMUL cs0, cx00y00, alpha; +--:-:-:-:1 SHL ldc8, ldc, 5; +--:-:-:-:1 XMAD.MRG xmad_ci, cy00, ldc.H1, RZ; +--:-:-:-:1 ISCADD writeCs, readBs, readAs, 4; +--:-:-:-:1 XMAD ci, cy00, ldc, cx; +--:-:-:-:1 SHL readCs, readCs, 2; +--:-:-:-:1 IADD cx, cx, 32; +--:-:-:-:1 ISCADD ldc28, ldc, -ldc4, 7; +--:-:-:-:1 FMUL cs1, cx01y00, alpha; +--:-:-:-:1 FMUL cs2, cx02y00, alpha; +--:-:-:-:1 XMAD.PSL.CBCC ci, cy00.H1, xmad_ci.H1, ci; +--:-:-:-:1 IADD cy00, cy00, -1; +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m +--:-:-:-:1 FMUL cs3, cx03y00, alpha; +--:-:-:-:1 FMUL cs4, cx32y00, alpha; +--:-:-:-:1 FMUL cs5, cx33y00, alpha; +--:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:1 FMUL cs6, cx34y00, alpha; +--:-:-:-:1 FMUL cs7, cx35y00, alpha; +--:-:-:Y:6 IADD Cy00, Cy00, -ldc1; +--:-:-:-:1 IADD Cy04, Cy00, ldc4; +--:-:-:Y:5 IADD Cy08, Cy00, ldc8; +--:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) + +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y01, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y01, alpha; +--:-:-:-:1 FMUL cs2, cx02y01, alpha; +--:-:-:-:1 
FMUL cs3, cx03y01, alpha; +--:-:-:-:1 FMUL cs4, cx32y01, alpha; +--:-:-:-:1 FMUL cs5, cx33y01, alpha; +--:-:-:-:1 FMUL cs6, cx34y01, alpha; +--:-:-:-:0 FMUL cs7, cx35y01, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y02, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y02, alpha; +--:-:-:-:1 FMUL cs2, cx02y02, alpha; +--:-:-:-:1 FMUL cs3, cx03y02, alpha; +--:-:-:-:1 FMUL cs4, cx32y02, alpha; +--:-:-:-:1 FMUL cs5, cx33y02, alpha; +--:-:-:-:1 FMUL cs6, cx34y02, alpha; +--:-:-:-:0 FMUL cs7, cx35y02, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y03, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y03, alpha; +--:-:-:-:1 FMUL cs2, cx02y03, alpha; +--:-:-:-:1 FMUL cs3, cx03y03, alpha; +--:-:-:-:1 FMUL cs4, cx32y03, alpha; +--:-:-:-:1 FMUL cs5, cx33y03, alpha; +--:-:-:-:1 FMUL cs6, cx34y03, alpha; +--:-:-:-:0 FMUL cs7, cx35y03, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 IADD cy00, cy00, 28; +--:-:-:-:1 IADD cy04, cy04, 28; +--:-:-:-:1 IADD cy08, cy08, 28; +--:-:-:-:1 IADD cy12, cy12, 28; + +02:-:-:-:1 IADD Cy00, Cy00, ldc28; // Wait Dep 2 +--:-:-:-:1 IADD Cy04, Cy04, ldc28; +--:-:-:-:1 IADD Cy08, Cy08, ldc28; +--:-:-:-:1 IADD Cy12, Cy12, ldc28; + +--:-:-:-:1 FMUL cs0, cx00y32, alpha; +--:-:-:-:1 FMUL cs1, cx01y32, alpha; +--:-:-:-:1 FMUL cs2, cx02y32, alpha; +--:-:-:-:1 FMUL cs3, cx03y32, alpha; +--:-:-:-:1 FMUL cs4, cx32y32, alpha; +--:-:-:-:1 FMUL cs5, cx33y32, alpha; +--:-:-:-:1 FMUL cs6, cx34y32, alpha; +--:-:-:-:0 FMUL cs7, cx35y32, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y33, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y33, alpha; +--:-:-:-:1 FMUL cs2, cx02y33, alpha; +--:-:-:-:1 FMUL cs3, cx03y33, alpha; +--:-:-:-:1 FMUL cs4, cx32y33, alpha; +--:-:-:-:1 FMUL cs5, cx33y33, alpha; +--:-:-:-:1 FMUL cs6, cx34y33, alpha; +--:-:-:-:0 FMUL cs7, cx35y33, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y34, alpha; // Wait Dep 2 +--:-:-:-:1 
FMUL cs1, cx01y34, alpha; +--:-:-:-:1 FMUL cs2, cx02y34, alpha; +--:-:-:-:1 FMUL cs3, cx03y34, alpha; +--:-:-:-:1 FMUL cs4, cx32y34, alpha; +--:-:-:-:1 FMUL cs5, cx33y34, alpha; +--:-:-:-:1 FMUL cs6, cx34y34, alpha; +--:-:-:-:0 FMUL cs7, cx35y34, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y35, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y35, alpha; +--:-:-:-:1 FMUL cs2, cx02y35, alpha; +--:-:-:-:1 FMUL cs3, cx03y35, alpha; +--:-:-:-:1 FMUL cs4, cx32y35, alpha; +--:-:-:-:1 FMUL cs5, cx33y35, alpha; +--:-:-:-:1 FMUL cs6, cx34y35, alpha; +--:-:-:-:0 FMUL cs7, cx35y35, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:0 IADD cy00, cy00, 1; +--:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; +--:-:-:-:0 IADD cy04, cy04, 1; +--:-:-:-:1 STS.128 [writeCs+4x<32>], cs4; +--:-:-:-:0 IADD cy08, cy08, 1; +--:-:-:-:1 LDS cs0, [readCs + 4x<0*64 + 00>]; +--:-:-:-:0 IADD cy12, cy12, 1; +--:-:-:-:1 LDS cs1, [readCs + 4x<0*64 + 32>]; +--:-:-:-:0 IADD Cy00, Cy00, ldc1; +--:-:-:-:1 LDS cs2, [readCs + 4x<1*64 + 00>]; +--:-:-:-:0 IADD Cy04, Cy04, ldc1; +--:-:-:-:1 LDS cs3, [readCs + 4x<1*64 + 32>]; +--:-:-:-:0 IADD Cy08, Cy08, ldc1; +--:-:-:-:1 LDS cs4, [readCs + 4x<2*64 + 00>]; +--:-:-:-:0 IADD Cy12, Cy12, ldc1; +--:-:-:-:1 LDS cs5, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m +--:-:-:-:1 LDS cs6, [readCs + 4x<3*64 + 00>]; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m +--:-:1:-:1 LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1 +--:-:-:-:2 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m +--:-:-:Y:7 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m +01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m +--:-:-:-:1 @P1 STG.CG [Cy00 + 4x<32>], cs1; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, 
c[0x0][0x148], P6; // cy08 < n && cx + 32 < m +--:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m +--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<32>], cs3; +--:-:-:Y:7 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m +--:-:-:-:2 @P0 STG.CG [Cy08 + 4x<00>], cs4; +--:-:-:-:2 @P1 STG.CG [Cy08 + 4x<32>], cs5; +--:-:-:-:2 @P2 STG.CG [Cy12 + 4x<00>], cs6; +--:2:-:-:1 @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2 + +--:-:-:-:5 RET; + diff --git a/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin b/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin new file mode 100644 index 0000000..0c7825f Binary files /dev/null and b/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin differ diff --git a/Assembler/MaxAs/sgemm/sgemm_sm52_64_dump.sass b/Assembler/MaxAs/sgemm/sgemm_sm52_64_dump.sass new file mode 100644 index 0000000..552d95b --- /dev/null +++ b/Assembler/MaxAs/sgemm/sgemm_sm52_64_dump.sass @@ -0,0 +1,1100 @@ + + code for sm_52 + Function : sgemm_kernel_128 + .headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)" + /* 0x001ffc00e22007f6 */ + /*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */ + /*0010*/ S2R R8, SR_TID.X; /* 0xf0c8000002170008 */ + /*0018*/ SSY 0x90; /* 0xe290000007000000 */ + /* 0x001fc400ffa00fed */ + /*0028*/ ISETP.GT.AND P0, PT, R8, 0x7f, PT; /* 0x3669038007f70807 */ + /*0030*/ @!P0 BRA 0x60; /* 0xe24000000288000f */ + /*0038*/ MOV R0, c[0x0][0x170]; /* 0x4c98078005c70000 */ + /* 0x001ff400fe0007f5 */ + /*0048*/ MOV32I R3, 0x20000000; /* 0x010200000007f003 */ + /*0050*/ { LOP32I.OR R2, R0, 0x80000000; /* 0x0428000000070002 */ + /*0058*/ SYNC; } /* 0xf0f800000007000f */ + /* 0x001fc000fea007f1 */ + /*0068*/ MOV R0, c[0x0][0x174]; /* 0x4c98078005d70000 */ + /*0070*/ MOV32I R3, 0x20000000; /* 0x010200000007f003 */ + /*0078*/ { LOP32I.OR R2, R0, 0x80000000; /* 0x0428000000070002 */ + /*0088*/ SYNC; } /* 0x001fd0800e2007fd */ + /* 0xf0f800000007000f */ + /*0090*/ TLD.B.LZ.NODEP.P R4, R8, R2, 
0x0, 1D, 0xf; /* 0xdd3a000780270804 */ + /*0098*/ SHL R0, R8, 0x4; /* 0x3848000000470800 */ + /* 0x081fc403ffe041f2 */ + /*00a8*/ STS.128 [R0], R4; /* 0xef5e000000070004 */ + /*00b0*/ BAR.SYNC 0x0; /* 0xf0a81b8000070000 */ + /*00b8*/ IADD32I R0, -R8.reuse, 0xff; /* 0x1d0000000ff70800 */ + /* 0x001fc000fe8207f5 */ + /*00c8*/ SHL R2, R8.reuse, 0x2; /* 0x3848000000270802 */ + /*00d0*/ SHL R0, R0, 0x4; /* 0x3848000000470000 */ + /*00d8*/ { IADD R4.CC, R2, c[0x0][0x140]; /* 0x4c10800005070204 */ + /*00e8*/ LDS.U.32 R0, [R0]; } /* 0x001fc400fec00711 */ + /* 0xef4c100000070000 */ + /*00f0*/ SHR R2, R8, 0x1e; /* 0x3829000001e70802 */ + /*00f8*/ IADD.X R3, R2, c[0x0][0x144]; /* 0x4c10080005170203 */ + /* 0x001ffc011e2007ff */ + /*0108*/ MOV R2, R4; /* 0x5c98078000470002 */ + /*0110*/ STG.E [R2], R0; /* 0xeedc200000070200 */ + /*0118*/ EXIT; /* 0xe30000000007000f */ + /* 0x001f8000fc0007ff */ + /*0128*/ BRA 0x120; /* 0xe2400fffff07000f */ + /*0130*/ NOP; /* 0x50b0000000070f00 */ + /*0138*/ NOP; /* 0x50b0000000070f00 */ + ................................. 
+ + + Function : sgemm_kernel_64 + .headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)" + /* 0x001d4400e6200711 */ + /*0008*/ S2R R119, SR_TID.X; /* 0xf0c8000002170077 */ + /*0010*/ S2R R125, SR_CTAID.X; /* 0xf0c800000257007d */ + /*0018*/ S2R R122, SR_CTAID.Y; /* 0xf0c800000267007a */ + /* 0x081fc440fe220ff1 */ + /*0028*/ ISETP.GE.AND P0, PT, R119.reuse, 0x20, PT; /* 0x366d038002077707 */ + /*0030*/ LOP.AND R9, R119.reuse, 0xf; /* 0x3847000000f77709 */ + /*0038*/ BFE.U32 R4, R119.reuse, 0x104; /* 0x3800000010477704 */ + /* 0x081fc440fe2007f1 */ + /*0048*/ MOV R12, c[0x0][0x14c]; /* 0x4c9807800537000c */ + /*0050*/ BFE.U32 R114, R119.reuse, 0x301; /* 0x3800000030177772 */ + /*0058*/ LOP.AND R115, R119.reuse, 0x30; /* 0x3847000003077773 */ + /* 0x081fc400fe2207f1 */ + /*0068*/ LOP.AND R0, R119.reuse, 0x1; /* 0x3847000000177700 */ + /*0070*/ SHL R13, R9, 0x4; /* 0x384800000047090d */ + /*0078*/ LOP.AND R80, R119.reuse, -0x20; /* 0x3947007ffe077750 */ + /* 0x081fc400fe2007f1 */ + /*0088*/ IADD R12, R12, -0x8; /* 0x3910007fff870c0c */ + /*0090*/ SHL R114, R114, 0x4; /* 0x3848000000477272 */ + /*0098*/ LOP.AND R126, R119.reuse, 0x1f; /* 0x3847000001f7777e */ + /* 0x001fc400fe2007f0 */ + /*00a8*/ { SHR.U32 R115, R115, 0x3; /* 0x3828000000377373 */ + /*00b0*/ STS.128 [R80+0x1000], RZ; } /* 0xef5e0001000750ff */ + /*00b8*/ @!P0 MOV R2, c[0x0][0x150]; /* 0x4c98078005480002 */ + /* 0x00dfc400fe2007f1 */ + /*00c8*/ ISCADD R118, R4, R13, 0x8; /* 0x5c18040000d70476 */ + /*00d0*/ @P0 MOV R2, c[0x0][0x154]; /* 0x4c98078005500002 */ + /*00d8*/ SEL R8, R122, R125, P0; /* 0x5ca0000007d77a08 */ + /* 0x001fc400fe2007f1 */ + /*00e8*/ @!P0 MOV32I R113, 0x80000001; /* 0x010800000018f071 */ + /*00f0*/ @P0 MOV32I R113, 0x80000000; /* 0x010800000000f071 */ + /*00f8*/ LOP.OR R115, R115, R0; /* 0x5c47020000077373 */ + /* 0x001fc440fe2007f1 */ + /*0108*/ LOP.AND R123, R119, 0x20; /* 0x384700000207777b */ + /*0110*/ SHR.U32 R1, R2.reuse, 0x2; /* 0x3828000000270201 */ + /*0118*/ IADD R121, 
R2, R2; /* 0x5c10000000270279 */ + /* 0x001fc800fe2007f1 */ + /*0128*/ ISCADD R112, R8, R9, 0x4; /* 0x5c18020000970870 */ + /*0130*/ @P0 IADD R118, R118, 0x800; /* 0x3810000080007676 */ + /*0138*/ ISCADD R115, R115, 0x800, 0x4; /* 0x3818020080077373 */ + /* 0x081f98c0fe2607f1 */ + /*0148*/ XMAD.MRG R5, R1.reuse, R4.H1.reuse, RZ; /* 0x5b007fa800470105 */ + /*0150*/ XMAD.MRG R16, R12.reuse, R1.H1.reuse, RZ; /* 0x5b007fa800170c10 */ + /*0158*/ XMAD R112, R1.reuse, R4, R112; /* 0x5b00380000470170 */ + /* 0x181fc480e28007f2 */ + /*0168*/ XMAD.PSL.CBCC R112, R1.H1, R5.H1, R112; /* 0x5b30381800570170 */ + /*0170*/ TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf; /* 0xdd38000787177060 */ + /*0178*/ IADD3 R116, R112.reuse, R1.reuse, R1; /* 0x5cc0008000177074 */ + /* 0x081fc080e62407f1 */ + /*0188*/ IADD R120, R112, R2.reuse; /* 0x5c10000000277078 */ + /*0190*/ TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf; /* 0xdd38000787177464 */ + /*0198*/ { XMAD R117, R12.reuse, R1, R112; /* 0x5b00380000170c75 */ + /*01a8*/ TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf; } /* 0x101dc400fe440753 */ + /* 0xdd38000787177868 */ + /*01b0*/ IADD R124, R116, R2; /* 0x5c1000000027747c */ + /*01b8*/ TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf; /* 0xdd38000787177c6c */ + /* 0x001e4400f22007f1 */ + /*01c8*/ XMAD.PSL.CBCC R117, R12.H1, R16.H1, R117; /* 0x5b303a9801070c75 */ + /*01d0*/ LDS.U.128 R0, [R80+0x1000]; /* 0xef4e100100075000 */ + /*01d8*/ LDS.U.128 R4, [R80+0x1000]; /* 0xef4e100100075004 */ + /* 0x001e4400f2200791 */ + /*01e8*/ LDS.U.128 R8, [R80+0x1000]; /* 0xef4e100100075008 */ + /*01f0*/ LDS.U.128 R12, [R80+0x1000]; /* 0xef4e10010007500c */ + /*01f8*/ LDS.U.128 R16, [R80+0x1000]; /* 0xef4e100100075010 */ + /* 0x001e4400f2200791 */ + /*0208*/ LDS.U.128 R20, [R80+0x1000]; /* 0xef4e100100075014 */ + /*0210*/ LDS.U.128 R24, [R80+0x1000]; /* 0xef4e100100075018 */ + /*0218*/ LDS.U.128 R28, [R80+0x1000]; /* 0xef4e10010007501c */ + /* 0x001e4400f2200791 */ + /*0228*/ LDS.U.128 R32, [R80+0x1000]; /* 
0xef4e100100075020 */ + /*0230*/ LDS.U.128 R36, [R80+0x1000]; /* 0xef4e100100075024 */ + /*0238*/ LDS.U.128 R40, [R80+0x1000]; /* 0xef4e100100075028 */ + /* 0x001e4400f2200791 */ + /*0248*/ LDS.U.128 R44, [R80+0x1000]; /* 0xef4e10010007502c */ + /*0250*/ LDS.U.128 R48, [R80+0x1000]; /* 0xef4e100100075030 */ + /*0258*/ LDS.U.128 R52, [R80+0x1000]; /* 0xef4e100100075034 */ + /* 0x003fc400f2200791 */ + /*0268*/ LDS.U.128 R56, [R80+0x1000]; /* 0xef4e100100075038 */ + /*0270*/ LDS.U.128 R60, [R80+0x1000]; /* 0xef4e10010007503c */ + /*0278*/ STS.128 [R118], R96; /* 0xef5e000000077660 */ + /* 0x101fc002fe2407f0 */ + /*0288*/ { IADD R112, R112, R121.reuse; /* 0x5c10000007977070 */ + /*0290*/ STS.128 [R118+0x200], R100; } /* 0xef5e000020077664 */ + /*0298*/ { IADD R116, R116, R121.reuse; /* 0x5c10000007977474 */ + /*02a8*/ STS.128 [R118+0x400], R104; } /* 0x011fc480fe0027f1 */ + /* 0xef5e000040077668 */ + /*02b0*/ { IADD R120, R120, R121.reuse; /* 0x5c10000007977878 */ + /*02b8*/ STS.128 [R118+0x600], R108; } /* 0xef5e00006007766c */ + /* 0x001fc010fea007f0 */ + /*02c8*/ { IADD R124, R124, R121; /* 0x5c10000007977c7c */ + /*02d0*/ BAR.SYNC 0x0; } /* 0xf0a81b8000070000 */ + /*02d8*/ { LOP.XOR R118, R118, 0x1000; /* 0x3847040100077676 */ + /*02e8*/ LDS.U.128 R64, [R114]; } /* 0x001fc400fe2007f1 */ + /* 0xef4e100000077240 */ + /*02f0*/ LDS.U.128 R72, [R115]; /* 0xef4e100000077348 */ + /*02f8*/ LDS.U.128 R68, [R114+0x80]; /* 0xef4e100008077244 */ + /* 0x183fc000fe200711 */ + /*0308*/ LDS.U.128 R76, [R115+0x80]; /* 0xef4e10000807734c */ + /*0310*/ ISETP.LE.AND P0, PT, R112, R117, PT; /* 0x5b67038007577007 */ + /*0318*/ { FFMA R1, R66.reuse, R72.reuse, R1; /* 0x5980008004874201 */ + /*0328*/ LDS.U.128 R80, [R114+0x100]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100010077250 */ + /*0330*/ FFMA R0, R66, R73.reuse, R0; /* 0x5980000004974200 */ + /*0338*/ { FFMA R2, R64.reuse, R73.reuse, R2; /* 0x5980010004974002 */ + /*0348*/ LDS.U.128 R88, [R115+0x100]; } /* 0x181fc080fe2007f1 */ + /* 
0xef4e100010077358 */ + /*0350*/ FFMA R3, R64, R72.reuse, R3; /* 0x5980018004874003 */ + /*0358*/ { FFMA R5, R67.reuse, R72.reuse, R5; /* 0x5980028004874305 */ + /*0368*/ LDS.U.128 R84, [R114+0x180]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100018077254 */ + /*0370*/ FFMA R4, R67, R73.reuse, R4; /* 0x5980020004974304 */ + /*0378*/ { FFMA R6, R65.reuse, R73.reuse, R6; /* 0x5980030004974106 */ + /*0388*/ LDS.U.128 R92, [R115+0x180]; } /* 0x181fc480fe200711 */ + /* 0xef4e10001807735c */ + /*0390*/ FFMA R7, R65, R72.reuse, R7; /* 0x5980038004874107 */ + /*0398*/ FFMA R33, R70.reuse, R72.reuse, R33; /* 0x5980108004874621 */ + /* 0x101fc4c0fe2407f1 */ + /*03a8*/ FFMA R32, R70, R73.reuse, R32; /* 0x5980100004974620 */ + /*03b0*/ FFMA R34, R68.reuse, R73.reuse, R34; /* 0x5980110004974422 */ + /*03b8*/ FFMA R35, R68, R72.reuse, R35; /* 0x5980118004874423 */ + /* 0x081fc4c0fe2607f1 */ + /*03c8*/ FFMA R37, R71.reuse, R72.reuse, R37; /* 0x5980128004874725 */ + /*03d0*/ FFMA R36, R71.reuse, R73.reuse, R36; /* 0x5980120004974724 */ + /*03d8*/ FFMA R38, R69.reuse, R73, R38; /* 0x5980130004974526 */ + /* 0x101fc4c0fe2207f1 */ + /*03e8*/ FFMA R39, R69.reuse, R72, R39; /* 0x5980138004874527 */ + /*03f0*/ FFMA R45, R71.reuse, R74.reuse, R45; /* 0x5980168004a7472d */ + /*03f8*/ FFMA R44, R71, R75.reuse, R44; /* 0x5980160004b7472c */ + /* 0x181fc480fe2607f1 */ + /*0408*/ FFMA R46, R69.reuse, R75.reuse, R46; /* 0x5980170004b7452e */ + /*0410*/ FFMA R47, R69, R74.reuse, R47; /* 0x5980178004a7452f */ + /*0418*/ FFMA R41, R70.reuse, R74.reuse, R41; /* 0x5980148004a74629 */ + /* 0x101fc4c0fe2407f1 */ + /*0428*/ FFMA R40, R70, R75.reuse, R40; /* 0x5980140004b74628 */ + /*0430*/ FFMA R42, R68.reuse, R75.reuse, R42; /* 0x5980150004b7442a */ + /*0438*/ FFMA R43, R68, R74.reuse, R43; /* 0x5980158004a7442b */ + /* 0x181fc480fe2607f1 */ + /*0448*/ FFMA R13, R67.reuse, R74.reuse, R13; /* 0x5980068004a7430d */ + /*0450*/ FFMA R12, R67, R75.reuse, R12; /* 0x5980060004b7430c */ + /*0458*/ FFMA R14, 
R65.reuse, R75.reuse, R14; /* 0x5980070004b7410e */ + /* 0x181fc4c0fe2407f1 */ + /*0468*/ FFMA R15, R65, R74.reuse, R15; /* 0x5980078004a7410f */ + /*0470*/ FFMA R9, R66.reuse, R74.reuse, R9; /* 0x5980048004a74209 */ + /*0478*/ FFMA R8, R66.reuse, R75.reuse, R8; /* 0x5980040004b74208 */ + /* 0x101fc440fe0207f1 */ + /*0488*/ FFMA R10, R64.reuse, R75, R10; /* 0x5980050004b7400a */ + /*0490*/ { FFMA R11, R64.reuse, R74, R11; /* 0x5980058004a7400b */ + /*0498*/ @P0 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf; } /* 0xdd38000787107060 */ + /* 0x101cc480fe0607e1 */ + /*04a8*/ FFMA R17, R66.reuse, R76.reuse, R17; /* 0x5980088004c74211 */ + /*04b0*/ { FFMA R16, R66, R77.reuse, R16; /* 0x5980080004d74210 */ + /*04b8*/ @P0 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf; } /* 0xdd38000787107464 */ + /* 0x181fc480fe2607f1 */ + /*04c8*/ FFMA R18, R64.reuse, R77.reuse, R18; /* 0x5980090004d74012 */ + /*04d0*/ FFMA R19, R64, R76.reuse, R19; /* 0x5980098004c74013 */ + /*04d8*/ FFMA R21, R67.reuse, R76.reuse, R21; /* 0x59800a8004c74315 */ + /* 0x101fc4c0fe2407f1 */ + /*04e8*/ FFMA R20, R67, R77.reuse, R20; /* 0x59800a0004d74314 */ + /*04f0*/ FFMA R22, R65.reuse, R77.reuse, R22; /* 0x59800b0004d74116 */ + /*04f8*/ FFMA R23, R65, R76.reuse, R23; /* 0x59800b8004c74117 */ + /* 0x181fc480fe2607f1 */ + /*0508*/ FFMA R49, R70.reuse, R76.reuse, R49; /* 0x5980188004c74631 */ + /*0510*/ FFMA R48, R70, R77.reuse, R48; /* 0x5980180004d74630 */ + /*0518*/ FFMA R50, R68.reuse, R77.reuse, R50; /* 0x5980190004d74432 */ + /* 0x181fc4c0fe2407f1 */ + /*0528*/ FFMA R51, R68, R76.reuse, R51; /* 0x5980198004c74433 */ + /*0530*/ FFMA R53, R71.reuse, R76.reuse, R53; /* 0x59801a8004c74735 */ + /*0538*/ FFMA R52, R71.reuse, R77.reuse, R52; /* 0x59801a0004d74734 */ + /* 0x181fc440fe2207f1 */ + /*0548*/ FFMA R54, R69.reuse, R77, R54; /* 0x59801b0004d74536 */ + /*0550*/ FFMA R55, R69.reuse, R76, R55; /* 0x59801b8004c74537 */ + /*0558*/ FFMA R61, R71.reuse, R78.reuse, R61; /* 0x59801e8004e7473d */ + /* 
0x101fc4c0fe2407f1 */ + /*0568*/ FFMA R60, R71, R79.reuse, R60; /* 0x59801e0004f7473c */ + /*0570*/ FFMA R62, R69.reuse, R79.reuse, R62; /* 0x59801f0004f7453e */ + /*0578*/ FFMA R63, R69, R78.reuse, R63; /* 0x59801f8004e7453f */ + /* 0x181fc480fe2607f1 */ + /*0588*/ FFMA R57, R70.reuse, R78.reuse, R57; /* 0x59801c8004e74639 */ + /*0590*/ FFMA R56, R70, R79.reuse, R56; /* 0x59801c0004f74638 */ + /*0598*/ FFMA R58, R68.reuse, R79.reuse, R58; /* 0x59801d0004f7443a */ + /* 0x101fc4c0fe2407f1 */ + /*05a8*/ FFMA R59, R68, R78.reuse, R59; /* 0x59801d8004e7443b */ + /*05b0*/ FFMA R29, R67.reuse, R78.reuse, R29; /* 0x59800e8004e7431d */ + /*05b8*/ FFMA R28, R67, R79.reuse, R28; /* 0x59800e0004f7431c */ + /* 0x181fc480fe2607f1 */ + /*05c8*/ FFMA R30, R65.reuse, R79.reuse, R30; /* 0x59800f0004f7411e */ + /*05d0*/ FFMA R31, R65, R78.reuse, R31; /* 0x59800f8004e7411f */ + /*05d8*/ FFMA R25, R66.reuse, R78.reuse, R25; /* 0x59800c8004e74219 */ + /* 0x001fc440fe2407f1 */ + /*05e8*/ FFMA R24, R66, R79.reuse, R24; /* 0x59800c0004f74218 */ + /*05f0*/ FFMA R26, R64.reuse, R79, R26; /* 0x59800d0004f7401a */ + /*05f8*/ FFMA R27, R64, R78, R27; /* 0x59800d8004e7401b */ + /* 0x101fc400fe260ff0 */ + /*0608*/ { FFMA R1, R82.reuse, R88.reuse, R1; /* 0x5980008005875201 */ + /*0610*/ LDS.U.128 R64, [R114+0x200]; } /* 0xef4e100020077240 */ + /*0618*/ FFMA R0, R82, R89.reuse, R0; /* 0x5980000005975200 */ + /* 0x101fc400fe2607f0 */ + /*0628*/ { FFMA R2, R80.reuse, R89.reuse, R2; /* 0x5980010005975002 */ + /*0630*/ LDS.U.128 R72, [R115+0x200]; } /* 0xef4e100020077348 */ + /*0638*/ FFMA R3, R80, R88.reuse, R3; /* 0x5980018005875003 */ + /* 0x101fc400fe2607f0 */ + /*0648*/ { FFMA R5, R83.reuse, R88.reuse, R5; /* 0x5980028005875305 */ + /*0650*/ LDS.U.128 R68, [R114+0x280]; } /* 0xef4e100028077244 */ + /*0658*/ FFMA R4, R83, R89.reuse, R4; /* 0x5980020005975304 */ + /* 0x101fc400e22607f0 */ + /*0668*/ { FFMA R6, R81.reuse, R89.reuse, R6; /* 0x5980030005975106 */ + /*0670*/ LDS.U.128 R76, 
[R115+0x280]; } /* 0xef4e10002807734c */ + /*0678*/ FFMA R7, R81, R88.reuse, R7; /* 0x5980038005875107 */ + /* 0x181fc480fe2607f1 */ + /*0688*/ FFMA R33, R86.reuse, R88.reuse, R33; /* 0x5980108005875621 */ + /*0690*/ FFMA R32, R86, R89.reuse, R32; /* 0x5980100005975620 */ + /*0698*/ FFMA R34, R84.reuse, R89.reuse, R34; /* 0x5980110005975422 */ + /* 0x181fc4c0fe2407f1 */ + /*06a8*/ FFMA R35, R84, R88.reuse, R35; /* 0x5980118005875423 */ + /*06b0*/ FFMA R37, R87.reuse, R88.reuse, R37; /* 0x5980128005875725 */ + /*06b8*/ FFMA R36, R87.reuse, R89.reuse, R36; /* 0x5980120005975724 */ + /* 0x181fc440fe2207f1 */ + /*06c8*/ FFMA R38, R85.reuse, R89, R38; /* 0x5980130005975526 */ + /*06d0*/ FFMA R39, R85.reuse, R88, R39; /* 0x5980138005875527 */ + /*06d8*/ FFMA R45, R87.reuse, R90.reuse, R45; /* 0x5980168005a7572d */ + /* 0x101fc4c0fe2407f1 */ + /*06e8*/ FFMA R44, R87, R91.reuse, R44; /* 0x5980160005b7572c */ + /*06f0*/ FFMA R46, R85.reuse, R91.reuse, R46; /* 0x5980170005b7552e */ + /*06f8*/ FFMA R47, R85, R90.reuse, R47; /* 0x5980178005a7552f */ + /* 0x181fc480fe2607f1 */ + /*0708*/ FFMA R41, R86.reuse, R90.reuse, R41; /* 0x5980148005a75629 */ + /*0710*/ FFMA R40, R86, R91.reuse, R40; /* 0x5980140005b75628 */ + /*0718*/ FFMA R42, R84.reuse, R91.reuse, R42; /* 0x5980150005b7542a */ + /* 0x101fc4c0fe2407f1 */ + /*0728*/ FFMA R43, R84, R90.reuse, R43; /* 0x5980158005a7542b */ + /*0730*/ FFMA R13, R83.reuse, R90.reuse, R13; /* 0x5980068005a7530d */ + /*0738*/ FFMA R12, R83, R91.reuse, R12; /* 0x5980060005b7530c */ + /* 0x181fc480fe2607f1 */ + /*0748*/ FFMA R14, R81.reuse, R91.reuse, R14; /* 0x5980070005b7510e */ + /*0750*/ FFMA R15, R81, R90.reuse, R15; /* 0x5980078005a7510f */ + /*0758*/ FFMA R9, R82.reuse, R90.reuse, R9; /* 0x5980048005a75209 */ + /* 0x081fc040fe2607f1 */ + /*0768*/ FFMA R8, R82.reuse, R91.reuse, R8; /* 0x5980040005b75208 */ + /*0770*/ FFMA R10, R80.reuse, R91, R10; /* 0x5980050005b7500a */ + /*0778*/ { FFMA R11, R80.reuse, R90, R11; /* 0x5980058005a7500b */ 
+ /*0788*/ @P0 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf; } /* 0x101fc0c0fc2407f1 */ + /* 0xdd38000787107868 */ + /*0790*/ FFMA R17, R82.reuse, R92.reuse, R17; /* 0x5980088005c75211 */ + /*0798*/ { FFMA R16, R82, R93.reuse, R16; /* 0x5980080005d75210 */ + /*07a8*/ @P0 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf; } /* 0x101fc4c0fe240751 */ + /* 0xdd38000787107c6c */ + /*07b0*/ FFMA R18, R80.reuse, R93.reuse, R18; /* 0x5980090005d75012 */ + /*07b8*/ FFMA R19, R80, R92.reuse, R19; /* 0x5980098005c75013 */ + /* 0x181fc480fe2607f1 */ + /*07c8*/ FFMA R21, R83.reuse, R92.reuse, R21; /* 0x59800a8005c75315 */ + /*07d0*/ FFMA R20, R83, R93.reuse, R20; /* 0x59800a0005d75314 */ + /*07d8*/ FFMA R22, R81.reuse, R93.reuse, R22; /* 0x59800b0005d75116 */ + /* 0x101fc4c0fe2407f1 */ + /*07e8*/ FFMA R23, R81, R92.reuse, R23; /* 0x59800b8005c75117 */ + /*07f0*/ FFMA R49, R86.reuse, R92.reuse, R49; /* 0x5980188005c75631 */ + /*07f8*/ FFMA R48, R86, R93.reuse, R48; /* 0x5980180005d75630 */ + /* 0x181fc480fe2607f1 */ + /*0808*/ FFMA R50, R84.reuse, R93.reuse, R50; /* 0x5980190005d75432 */ + /*0810*/ FFMA R51, R84, R92.reuse, R51; /* 0x5980198005c75433 */ + /*0818*/ FFMA R53, R87.reuse, R92.reuse, R53; /* 0x59801a8005c75735 */ + /* 0x081fc440fe2607f1 */ + /*0828*/ FFMA R52, R87.reuse, R93.reuse, R52; /* 0x59801a0005d75734 */ + /*0830*/ FFMA R54, R85.reuse, R93, R54; /* 0x59801b0005d75536 */ + /*0838*/ FFMA R55, R85.reuse, R92, R55; /* 0x59801b8005c75537 */ + /* 0x181fc480fe2607f1 */ + /*0848*/ FFMA R61, R87.reuse, R94.reuse, R61; /* 0x59801e8005e7573d */ + /*0850*/ FFMA R60, R87, R95.reuse, R60; /* 0x59801e0005f7573c */ + /*0858*/ FFMA R62, R85.reuse, R95.reuse, R62; /* 0x59801f0005f7553e */ + /* 0x101fc4c0fe2407f1 */ + /*0868*/ FFMA R63, R85, R94.reuse, R63; /* 0x59801f8005e7553f */ + /*0870*/ FFMA R57, R86.reuse, R94.reuse, R57; /* 0x59801c8005e75639 */ + /*0878*/ FFMA R56, R86, R95.reuse, R56; /* 0x59801c0005f75638 */ + /* 0x181fc480fe2607f1 */ + /*0888*/ FFMA R58, R84.reuse, R95.reuse, 
R58; /* 0x59801d0005f7543a */ + /*0890*/ FFMA R59, R84, R94.reuse, R59; /* 0x59801d8005e7543b */ + /*0898*/ FFMA R29, R83.reuse, R94.reuse, R29; /* 0x59800e8005e7531d */ + /* 0x101fc4c0fe2407f1 */ + /*08a8*/ FFMA R28, R83, R95.reuse, R28; /* 0x59800e0005f7531c */ + /*08b0*/ FFMA R30, R81.reuse, R95.reuse, R30; /* 0x59800f0005f7511e */ + /*08b8*/ FFMA R31, R81, R94.reuse, R31; /* 0x59800f8005e7511f */ + /* 0x081fc480fe2607f1 */ + /*08c8*/ FFMA R25, R82.reuse, R94.reuse, R25; /* 0x59800c8005e75219 */ + /*08d0*/ FFMA R24, R82, R95.reuse, R24; /* 0x59800c0005f75218 */ + /*08d8*/ FFMA R26, R80.reuse, R95, R26; /* 0x59800d0005f7501a */ + /* 0x001fc4c1fe0007f1 */ + /*08e8*/ FFMA R27, R80, R94, R27; /* 0x59800d8005e7501b */ + /*08f0*/ { FFMA R1, R66.reuse, R72.reuse, R1; /* 0x5980008004874201 */ + /*08f8*/ LDS.U.128 R80, [R114+0x300]; } /* 0xef4e100030077250 */ + /* 0x001fc4c0fe0407f1 */ + /*0908*/ FFMA R0, R66, R73.reuse, R0; /* 0x5980000004974200 */ + /*0910*/ { FFMA R2, R64.reuse, R73.reuse, R2; /* 0x5980010004974002 */ + /*0918*/ LDS.U.128 R88, [R115+0x300]; } /* 0xef4e100030077358 */ + /* 0x001fc4c0fe0407f1 */ + /*0928*/ FFMA R3, R64, R72.reuse, R3; /* 0x5980018004874003 */ + /*0930*/ { FFMA R5, R67.reuse, R72.reuse, R5; /* 0x5980028004874305 */ + /*0938*/ LDS.U.128 R84, [R114+0x380]; } /* 0xef4e100038077254 */ + /* 0x001c44c0fe0407f1 */ + /*0948*/ FFMA R4, R67, R73.reuse, R4; /* 0x5980020004974304 */ + /*0950*/ { FFMA R6, R65.reuse, R73.reuse, R6; /* 0x5980030004974106 */ + /*0958*/ LDS.U.128 R92, [R115+0x380]; } /* 0xef4e10003807735c */ + /* 0x101fc4c0fe2407f1 */ + /*0968*/ FFMA R7, R65, R72.reuse, R7; /* 0x5980038004874107 */ + /*0970*/ FFMA R33, R70.reuse, R72.reuse, R33; /* 0x5980108004874621 */ + /*0978*/ FFMA R32, R70, R73.reuse, R32; /* 0x5980100004974620 */ + /* 0x181fc480fe2607f1 */ + /*0988*/ FFMA R34, R68.reuse, R73.reuse, R34; /* 0x5980110004974422 */ + /*0990*/ FFMA R35, R68, R72.reuse, R35; /* 0x5980118004874423 */ + /*0998*/ FFMA R37, R71.reuse, 
R72.reuse, R37; /* 0x5980128004874725 */ + /* 0x081fc440fe2607f1 */ + /*09a8*/ FFMA R36, R71.reuse, R73.reuse, R36; /* 0x5980120004974724 */ + /*09b0*/ FFMA R38, R69.reuse, R73, R38; /* 0x5980130004974526 */ + /*09b8*/ FFMA R39, R69.reuse, R72, R39; /* 0x5980138004874527 */ + /* 0x181fc480fe2607f1 */ + /*09c8*/ FFMA R45, R71.reuse, R74.reuse, R45; /* 0x5980168004a7472d */ + /*09d0*/ FFMA R44, R71, R75.reuse, R44; /* 0x5980160004b7472c */ + /*09d8*/ FFMA R46, R69.reuse, R75.reuse, R46; /* 0x5980170004b7452e */ + /* 0x101fc4c0fe2407f1 */ + /*09e8*/ FFMA R47, R69, R74.reuse, R47; /* 0x5980178004a7452f */ + /*09f0*/ FFMA R41, R70.reuse, R74.reuse, R41; /* 0x5980148004a74629 */ + /*09f8*/ FFMA R40, R70, R75.reuse, R40; /* 0x5980140004b74628 */ + /* 0x181fc480fe2607f1 */ + /*0a08*/ FFMA R42, R68.reuse, R75.reuse, R42; /* 0x5980150004b7442a */ + /*0a10*/ FFMA R43, R68, R74.reuse, R43; /* 0x5980158004a7442b */ + /*0a18*/ FFMA R13, R67.reuse, R74.reuse, R13; /* 0x5980068004a7430d */ + /* 0x101fc4c0fe2407f1 */ + /*0a28*/ FFMA R12, R67, R75.reuse, R12; /* 0x5980060004b7430c */ + /*0a30*/ FFMA R14, R65.reuse, R75.reuse, R14; /* 0x5980070004b7410e */ + /*0a38*/ FFMA R15, R65, R74.reuse, R15; /* 0x5980078004a7410f */ + /* 0x081fc4c0fe2607f1 */ + /*0a48*/ FFMA R9, R66.reuse, R74.reuse, R9; /* 0x5980048004a74209 */ + /*0a50*/ FFMA R8, R66.reuse, R75.reuse, R8; /* 0x5980040004b74208 */ + /*0a58*/ FFMA R10, R64.reuse, R75, R10; /* 0x5980050004b7400a */ + /* 0x101fc4c0fc2207f1 */ + /*0a68*/ FFMA R11, R64.reuse, R74, R11; /* 0x5980058004a7400b */ + /*0a70*/ FFMA R17, R66.reuse, R76.reuse, R17; /* 0x5980088004c74211 */ + /*0a78*/ FFMA R16, R66, R77.reuse, R16; /* 0x5980080004d74210 */ + /* 0x181fc480fe2607f1 */ + /*0a88*/ FFMA R18, R64.reuse, R77.reuse, R18; /* 0x5980090004d74012 */ + /*0a90*/ FFMA R19, R64, R76.reuse, R19; /* 0x5980098004c74013 */ + /*0a98*/ FFMA R21, R67.reuse, R76.reuse, R21; /* 0x59800a8004c74315 */ + /* 0x101fc4c0fe2407f1 */ + /*0aa8*/ FFMA R20, R67, R77.reuse, 
R20; /* 0x59800a0004d74314 */ + /*0ab0*/ FFMA R22, R65.reuse, R77.reuse, R22; /* 0x59800b0004d74116 */ + /*0ab8*/ FFMA R23, R65, R76.reuse, R23; /* 0x59800b8004c74117 */ + /* 0x181fc480fe2607f1 */ + /*0ac8*/ FFMA R49, R70.reuse, R76.reuse, R49; /* 0x5980188004c74631 */ + /*0ad0*/ FFMA R48, R70, R77.reuse, R48; /* 0x5980180004d74630 */ + /*0ad8*/ FFMA R50, R68.reuse, R77.reuse, R50; /* 0x5980190004d74432 */ + /* 0x181fc4c0fe2407f1 */ + /*0ae8*/ FFMA R51, R68, R76.reuse, R51; /* 0x5980198004c74433 */ + /*0af0*/ FFMA R53, R71.reuse, R76.reuse, R53; /* 0x59801a8004c74735 */ + /*0af8*/ FFMA R52, R71.reuse, R77.reuse, R52; /* 0x59801a0004d74734 */ + /* 0x181fc440fe2207f1 */ + /*0b08*/ FFMA R54, R69.reuse, R77, R54; /* 0x59801b0004d74536 */ + /*0b10*/ FFMA R55, R69.reuse, R76, R55; /* 0x59801b8004c74537 */ + /*0b18*/ FFMA R61, R71.reuse, R78.reuse, R61; /* 0x59801e8004e7473d */ + /* 0x101fc4c0fe2407f1 */ + /*0b28*/ FFMA R60, R71, R79.reuse, R60; /* 0x59801e0004f7473c */ + /*0b30*/ FFMA R62, R69.reuse, R79.reuse, R62; /* 0x59801f0004f7453e */ + /*0b38*/ FFMA R63, R69, R78.reuse, R63; /* 0x59801f8004e7453f */ + /* 0x181fc480fe2607f1 */ + /*0b48*/ FFMA R57, R70.reuse, R78.reuse, R57; /* 0x59801c8004e74639 */ + /*0b50*/ FFMA R56, R70, R79.reuse, R56; /* 0x59801c0004f74638 */ + /*0b58*/ FFMA R58, R68.reuse, R79.reuse, R58; /* 0x59801d0004f7443a */ + /* 0x101fc4c0fe2407f1 */ + /*0b68*/ FFMA R59, R68, R78.reuse, R59; /* 0x59801d8004e7443b */ + /*0b70*/ FFMA R29, R67.reuse, R78.reuse, R29; /* 0x59800e8004e7431d */ + /*0b78*/ FFMA R28, R67, R79.reuse, R28; /* 0x59800e0004f7431c */ + /* 0x181fc480fe2607f1 */ + /*0b88*/ FFMA R30, R65.reuse, R79.reuse, R30; /* 0x59800f0004f7411e */ + /*0b90*/ FFMA R31, R65, R78.reuse, R31; /* 0x59800f8004e7411f */ + /*0b98*/ FFMA R25, R66.reuse, R78.reuse, R25; /* 0x59800c8004e74219 */ + /* 0x001fc440fe2407f1 */ + /*0ba8*/ FFMA R24, R66, R79.reuse, R24; /* 0x59800c0004f74218 */ + /*0bb0*/ FFMA R26, R64.reuse, R79, R26; /* 0x59800d0004f7401a */ + 
/*0bb8*/ FFMA R27, R64, R78, R27; /* 0x59800d8004e7401b */ + /* 0x101fc400fe260ff0 */ + /*0bc8*/ { FFMA R1, R82.reuse, R88.reuse, R1; /* 0x5980008005875201 */ + /*0bd0*/ LDS.U.128 R64, [R114+0x400]; } /* 0xef4e100040077240 */ + /*0bd8*/ FFMA R0, R82, R89.reuse, R0; /* 0x5980000005975200 */ + /* 0x101fc400fe2607f0 */ + /*0be8*/ { FFMA R2, R80.reuse, R89.reuse, R2; /* 0x5980010005975002 */ + /*0bf0*/ LDS.U.128 R72, [R115+0x400]; } /* 0xef4e100040077348 */ + /*0bf8*/ FFMA R3, R80, R88.reuse, R3; /* 0x5980018005875003 */ + /* 0x101fc400fe2607f0 */ + /*0c08*/ { FFMA R5, R83.reuse, R88.reuse, R5; /* 0x5980028005875305 */ + /*0c10*/ LDS.U.128 R68, [R114+0x480]; } /* 0xef4e100048077244 */ + /*0c18*/ FFMA R4, R83, R89.reuse, R4; /* 0x5980020005975304 */ + /* 0x101fc400e22607f0 */ + /*0c28*/ { FFMA R6, R81.reuse, R89.reuse, R6; /* 0x5980030005975106 */ + /*0c30*/ LDS.U.128 R76, [R115+0x480]; } /* 0xef4e10004807734c */ + /*0c38*/ FFMA R7, R81, R88.reuse, R7; /* 0x5980038005875107 */ + /* 0x181fc480fe2607f1 */ + /*0c48*/ FFMA R33, R86.reuse, R88.reuse, R33; /* 0x5980108005875621 */ + /*0c50*/ FFMA R32, R86, R89.reuse, R32; /* 0x5980100005975620 */ + /*0c58*/ FFMA R34, R84.reuse, R89.reuse, R34; /* 0x5980110005975422 */ + /* 0x181fc4c0fe2407f1 */ + /*0c68*/ FFMA R35, R84, R88.reuse, R35; /* 0x5980118005875423 */ + /*0c70*/ FFMA R37, R87.reuse, R88.reuse, R37; /* 0x5980128005875725 */ + /*0c78*/ FFMA R36, R87.reuse, R89.reuse, R36; /* 0x5980120005975724 */ + /* 0x181fc440fe2207f1 */ + /*0c88*/ FFMA R38, R85.reuse, R89, R38; /* 0x5980130005975526 */ + /*0c90*/ FFMA R39, R85.reuse, R88, R39; /* 0x5980138005875527 */ + /*0c98*/ FFMA R45, R87.reuse, R90.reuse, R45; /* 0x5980168005a7572d */ + /* 0x101fc4c0fe2407f1 */ + /*0ca8*/ FFMA R44, R87, R91.reuse, R44; /* 0x5980160005b7572c */ + /*0cb0*/ FFMA R46, R85.reuse, R91.reuse, R46; /* 0x5980170005b7552e */ + /*0cb8*/ FFMA R47, R85, R90.reuse, R47; /* 0x5980178005a7552f */ + /* 0x181fc480fe2607f1 */ + /*0cc8*/ FFMA R41, R86.reuse, 
R90.reuse, R41; /* 0x5980148005a75629 */ + /*0cd0*/ FFMA R40, R86, R91.reuse, R40; /* 0x5980140005b75628 */ + /*0cd8*/ FFMA R42, R84.reuse, R91.reuse, R42; /* 0x5980150005b7542a */ + /* 0x101fc4c0fe2407f1 */ + /*0ce8*/ FFMA R43, R84, R90.reuse, R43; /* 0x5980158005a7542b */ + /*0cf0*/ FFMA R13, R83.reuse, R90.reuse, R13; /* 0x5980068005a7530d */ + /*0cf8*/ FFMA R12, R83, R91.reuse, R12; /* 0x5980060005b7530c */ + /* 0x181fc480fe2607f1 */ + /*0d08*/ FFMA R14, R81.reuse, R91.reuse, R14; /* 0x5980070005b7510e */ + /*0d10*/ FFMA R15, R81, R90.reuse, R15; /* 0x5980078005a7510f */ + /*0d18*/ FFMA R9, R82.reuse, R90.reuse, R9; /* 0x5980048005a75209 */ + /* 0x081fc440fe2607f1 */ + /*0d28*/ FFMA R8, R82.reuse, R91.reuse, R8; /* 0x5980040005b75208 */ + /*0d30*/ FFMA R10, R80.reuse, R91, R10; /* 0x5980050005b7500a */ + /*0d38*/ FFMA R11, R80.reuse, R90, R11; /* 0x5980058005a7500b */ + /* 0x181fc480fe2607e1 */ + /*0d48*/ FFMA R17, R82.reuse, R92.reuse, R17; /* 0x5980088005c75211 */ + /*0d50*/ FFMA R16, R82, R93.reuse, R16; /* 0x5980080005d75210 */ + /*0d58*/ FFMA R18, R80.reuse, R93.reuse, R18; /* 0x5980090005d75012 */ + /* 0x101fc4c0fe2407f1 */ + /*0d68*/ FFMA R19, R80, R92.reuse, R19; /* 0x5980098005c75013 */ + /*0d70*/ FFMA R21, R83.reuse, R92.reuse, R21; /* 0x59800a8005c75315 */ + /*0d78*/ FFMA R20, R83, R93.reuse, R20; /* 0x59800a0005d75314 */ + /* 0x181fc480fe2607f1 */ + /*0d88*/ FFMA R22, R81.reuse, R93.reuse, R22; /* 0x59800b0005d75116 */ + /*0d90*/ FFMA R23, R81, R92.reuse, R23; /* 0x59800b8005c75117 */ + /*0d98*/ FFMA R49, R86.reuse, R92.reuse, R49; /* 0x5980188005c75631 */ + /* 0x101fc4c0fe2407f1 */ + /*0da8*/ FFMA R48, R86, R93.reuse, R48; /* 0x5980180005d75630 */ + /*0db0*/ FFMA R50, R84.reuse, R93.reuse, R50; /* 0x5980190005d75432 */ + /*0db8*/ FFMA R51, R84, R92.reuse, R51; /* 0x5980198005c75433 */ + /* 0x081fc4c0fe2607f1 */ + /*0dc8*/ FFMA R53, R87.reuse, R92.reuse, R53; /* 0x59801a8005c75735 */ + /*0dd0*/ FFMA R52, R87.reuse, R93.reuse, R52; /* 
0x59801a0005d75734 */ + /*0dd8*/ FFMA R54, R85.reuse, R93, R54; /* 0x59801b0005d75536 */ + /* 0x101fc4c0fe2207f1 */ + /*0de8*/ FFMA R55, R85.reuse, R92, R55; /* 0x59801b8005c75537 */ + /*0df0*/ FFMA R61, R87.reuse, R94.reuse, R61; /* 0x59801e8005e7573d */ + /*0df8*/ FFMA R60, R87, R95.reuse, R60; /* 0x59801e0005f7573c */ + /* 0x181fc480fe2607f1 */ + /*0e08*/ FFMA R62, R85.reuse, R95.reuse, R62; /* 0x59801f0005f7553e */ + /*0e10*/ FFMA R63, R85, R94.reuse, R63; /* 0x59801f8005e7553f */ + /*0e18*/ FFMA R57, R86.reuse, R94.reuse, R57; /* 0x59801c8005e75639 */ + /* 0x101fc4c0fe2407f1 */ + /*0e28*/ FFMA R56, R86, R95.reuse, R56; /* 0x59801c0005f75638 */ + /*0e30*/ FFMA R58, R84.reuse, R95.reuse, R58; /* 0x59801d0005f7543a */ + /*0e38*/ FFMA R59, R84, R94.reuse, R59; /* 0x59801d8005e7543b */ + /* 0x181fc480fe2607f1 */ + /*0e48*/ FFMA R29, R83.reuse, R94.reuse, R29; /* 0x59800e8005e7531d */ + /*0e50*/ FFMA R28, R83, R95.reuse, R28; /* 0x59800e0005f7531c */ + /*0e58*/ FFMA R30, R81.reuse, R95.reuse, R30; /* 0x59800f0005f7511e */ + /* 0x101fc4c0fe2407f1 */ + /*0e68*/ FFMA R31, R81, R94.reuse, R31; /* 0x59800f8005e7511f */ + /*0e70*/ FFMA R25, R82.reuse, R94.reuse, R25; /* 0x59800c8005e75219 */ + /*0e78*/ FFMA R24, R82, R95.reuse, R24; /* 0x59800c0005f75218 */ + /* 0x183fc000fe2207f1 */ + /*0e88*/ FFMA R26, R80.reuse, R95, R26; /* 0x59800d0005f7501a */ + /*0e90*/ FFMA R27, R80, R94, R27; /* 0x59800d8005e7501b */ + /*0e98*/ { FFMA R1, R66.reuse, R72.reuse, R1; /* 0x5980008004874201 */ + /*0ea8*/ LDS.U.128 R80, [R114+0x500]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100050077250 */ + /*0eb0*/ FFMA R0, R66, R73.reuse, R0; /* 0x5980000004974200 */ + /*0eb8*/ { FFMA R2, R64.reuse, R73.reuse, R2; /* 0x5980010004974002 */ + /*0ec8*/ LDS.U.128 R88, [R115+0x500]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100050077358 */ + /*0ed0*/ FFMA R3, R64, R72.reuse, R3; /* 0x5980018004874003 */ + /*0ed8*/ { FFMA R5, R67.reuse, R72.reuse, R5; /* 0x5980028004874305 */ + /*0ee8*/ LDS.U.128 R84, 
[R114+0x580]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100058077254 */ + /*0ef0*/ FFMA R4, R67, R73.reuse, R4; /* 0x5980020004974304 */ + /*0ef8*/ { FFMA R6, R65.reuse, R73.reuse, R6; /* 0x5980030004974106 */ + /*0f08*/ LDS.U.128 R92, [R115+0x580]; } /* 0x181fc480fe200711 */ + /* 0xef4e10005807735c */ + /*0f10*/ FFMA R7, R65, R72.reuse, R7; /* 0x5980038004874107 */ + /*0f18*/ FFMA R33, R70.reuse, R72.reuse, R33; /* 0x5980108004874621 */ + /* 0x101fc4c0fe2407f1 */ + /*0f28*/ FFMA R32, R70, R73.reuse, R32; /* 0x5980100004974620 */ + /*0f30*/ FFMA R34, R68.reuse, R73.reuse, R34; /* 0x5980110004974422 */ + /*0f38*/ FFMA R35, R68, R72.reuse, R35; /* 0x5980118004874423 */ + /* 0x081fc4c0fe2607f1 */ + /*0f48*/ FFMA R37, R71.reuse, R72.reuse, R37; /* 0x5980128004874725 */ + /*0f50*/ FFMA R36, R71.reuse, R73.reuse, R36; /* 0x5980120004974724 */ + /*0f58*/ FFMA R38, R69.reuse, R73, R38; /* 0x5980130004974526 */ + /* 0x101fc4c0fe2207f1 */ + /*0f68*/ FFMA R39, R69.reuse, R72, R39; /* 0x5980138004874527 */ + /*0f70*/ FFMA R45, R71.reuse, R74.reuse, R45; /* 0x5980168004a7472d */ + /*0f78*/ FFMA R44, R71, R75.reuse, R44; /* 0x5980160004b7472c */ + /* 0x181fc480fe2607f1 */ + /*0f88*/ FFMA R46, R69.reuse, R75.reuse, R46; /* 0x5980170004b7452e */ + /*0f90*/ FFMA R47, R69, R74.reuse, R47; /* 0x5980178004a7452f */ + /*0f98*/ FFMA R41, R70.reuse, R74.reuse, R41; /* 0x5980148004a74629 */ + /* 0x101fc4c0fe2407f1 */ + /*0fa8*/ FFMA R40, R70, R75.reuse, R40; /* 0x5980140004b74628 */ + /*0fb0*/ FFMA R42, R68.reuse, R75.reuse, R42; /* 0x5980150004b7442a */ + /*0fb8*/ FFMA R43, R68, R74.reuse, R43; /* 0x5980158004a7442b */ + /* 0x181fc480fe2607f1 */ + /*0fc8*/ FFMA R13, R67.reuse, R74.reuse, R13; /* 0x5980068004a7430d */ + /*0fd0*/ FFMA R12, R67, R75.reuse, R12; /* 0x5980060004b7430c */ + /*0fd8*/ FFMA R14, R65.reuse, R75.reuse, R14; /* 0x5980070004b7410e */ + /* 0x181fc4c0fe2407f1 */ + /*0fe8*/ FFMA R15, R65, R74.reuse, R15; /* 0x5980078004a7410f */ + /*0ff0*/ FFMA R9, R66.reuse, R74.reuse, R9; 
/* 0x5980048004a74209 */ + /*0ff8*/ FFMA R8, R66.reuse, R75.reuse, R8; /* 0x5980040004b74208 */ + /* 0x181f8440fe2207f1 */ + /*1008*/ FFMA R10, R64.reuse, R75, R10; /* 0x5980050004b7400a */ + /*1010*/ FFMA R11, R64.reuse, R74, R11; /* 0x5980058004a7400b */ + /*1018*/ FFMA R17, R66.reuse, R76.reuse, R17; /* 0x5980088004c74211 */ + /* 0x101fc4c0fe2407f1 */ + /*1028*/ FFMA R16, R66, R77.reuse, R16; /* 0x5980080004d74210 */ + /*1030*/ FFMA R18, R64.reuse, R77.reuse, R18; /* 0x5980090004d74012 */ + /*1038*/ FFMA R19, R64, R76.reuse, R19; /* 0x5980098004c74013 */ + /* 0x181fc480fe2607f1 */ + /*1048*/ FFMA R21, R67.reuse, R76.reuse, R21; /* 0x59800a8004c74315 */ + /*1050*/ FFMA R20, R67, R77.reuse, R20; /* 0x59800a0004d74314 */ + /*1058*/ FFMA R22, R65.reuse, R77.reuse, R22; /* 0x59800b0004d74116 */ + /* 0x101fc4c0fe2407f1 */ + /*1068*/ FFMA R23, R65, R76.reuse, R23; /* 0x59800b8004c74117 */ + /*1070*/ FFMA R49, R70.reuse, R76.reuse, R49; /* 0x5980188004c74631 */ + /*1078*/ FFMA R48, R70, R77.reuse, R48; /* 0x5980180004d74630 */ + /* 0x181fc480fe2607f1 */ + /*1088*/ FFMA R50, R68.reuse, R77.reuse, R50; /* 0x5980190004d74432 */ + /*1090*/ FFMA R51, R68, R76.reuse, R51; /* 0x5980198004c74433 */ + /*1098*/ FFMA R53, R71.reuse, R76.reuse, R53; /* 0x59801a8004c74735 */ + /* 0x081fc440fe2607f1 */ + /*10a8*/ FFMA R52, R71.reuse, R77.reuse, R52; /* 0x59801a0004d74734 */ + /*10b0*/ FFMA R54, R69.reuse, R77, R54; /* 0x59801b0004d74536 */ + /*10b8*/ FFMA R55, R69.reuse, R76, R55; /* 0x59801b8004c74537 */ + /* 0x181fc480fe2607f1 */ + /*10c8*/ FFMA R61, R71.reuse, R78.reuse, R61; /* 0x59801e8004e7473d */ + /*10d0*/ FFMA R60, R71, R79.reuse, R60; /* 0x59801e0004f7473c */ + /*10d8*/ FFMA R62, R69.reuse, R79.reuse, R62; /* 0x59801f0004f7453e */ + /* 0x101fc4c0fe2407f1 */ + /*10e8*/ FFMA R63, R69, R78.reuse, R63; /* 0x59801f8004e7453f */ + /*10f0*/ FFMA R57, R70.reuse, R78.reuse, R57; /* 0x59801c8004e74639 */ + /*10f8*/ FFMA R56, R70, R79.reuse, R56; /* 0x59801c0004f74638 */ + /* 
0x181fc480fe2607f1 */ + /*1108*/ FFMA R58, R68.reuse, R79.reuse, R58; /* 0x59801d0004f7443a */ + /*1110*/ FFMA R59, R68, R78.reuse, R59; /* 0x59801d8004e7443b */ + /*1118*/ FFMA R29, R67.reuse, R78.reuse, R29; /* 0x59800e8004e7431d */ + /* 0x101fc4c0fe2407f1 */ + /*1128*/ FFMA R28, R67, R79.reuse, R28; /* 0x59800e0004f7431c */ + /*1130*/ FFMA R30, R65.reuse, R79.reuse, R30; /* 0x59800f0004f7411e */ + /*1138*/ FFMA R31, R65, R78.reuse, R31; /* 0x59800f8004e7411f */ + /* 0x081fc480fe2607f1 */ + /*1148*/ FFMA R25, R66.reuse, R78.reuse, R25; /* 0x59800c8004e74219 */ + /*1150*/ FFMA R24, R66, R79.reuse, R24; /* 0x59800c0004f74218 */ + /*1158*/ FFMA R26, R64.reuse, R79, R26; /* 0x59800d0004f7401a */ + /* 0x001fc4c1fe0007f1 */ + /*1168*/ FFMA R27, R64, R78, R27; /* 0x59800d8004e7401b */ + /*1170*/ { FFMA R1, R82.reuse, R88.reuse, R1; /* 0x5980008005875201 */ + /*1178*/ LDS.U.128 R64, [R114+0x600]; } /* 0xef4e100060077240 */ + /* 0x001fc4c0fe0407f1 */ + /*1188*/ FFMA R0, R82, R89.reuse, R0; /* 0x5980000005975200 */ + /*1190*/ { FFMA R2, R80.reuse, R89.reuse, R2; /* 0x5980010005975002 */ + /*1198*/ LDS.U.128 R72, [R115+0x600]; } /* 0xef4e100060077348 */ + /* 0x001fc4c0fe0407f1 */ + /*11a8*/ FFMA R3, R80, R88.reuse, R3; /* 0x5980018005875003 */ + /*11b0*/ { FFMA R5, R83.reuse, R88.reuse, R5; /* 0x5980028005875305 */ + /*11b8*/ LDS.U.128 R68, [R114+0x680]; } /* 0xef4e100068077244 */ + /* 0x001c44c0fe0407f1 */ + /*11c8*/ FFMA R4, R83, R89.reuse, R4; /* 0x5980020005975304 */ + /*11d0*/ { FFMA R6, R81.reuse, R89.reuse, R6; /* 0x5980030005975106 */ + /*11d8*/ LDS.U.128 R76, [R115+0x680]; } /* 0xef4e10006807734c */ + /* 0x101fc4c0fe2407f1 */ + /*11e8*/ FFMA R7, R81, R88.reuse, R7; /* 0x5980038005875107 */ + /*11f0*/ FFMA R33, R86.reuse, R88.reuse, R33; /* 0x5980108005875621 */ + /*11f8*/ FFMA R32, R86, R89.reuse, R32; /* 0x5980100005975620 */ + /* 0x181fc480fe2607f1 */ + /*1208*/ FFMA R34, R84.reuse, R89.reuse, R34; /* 0x5980110005975422 */ + /*1210*/ FFMA R35, R84, R88.reuse, 
R35; /* 0x5980118005875423 */ + /*1218*/ FFMA R37, R87.reuse, R88.reuse, R37; /* 0x5980128005875725 */ + /* 0x081fc440fe2607f1 */ + /*1228*/ FFMA R36, R87.reuse, R89.reuse, R36; /* 0x5980120005975724 */ + /*1230*/ FFMA R38, R85.reuse, R89, R38; /* 0x5980130005975526 */ + /*1238*/ FFMA R39, R85.reuse, R88, R39; /* 0x5980138005875527 */ + /* 0x181fc480fe2607f1 */ + /*1248*/ FFMA R45, R87.reuse, R90.reuse, R45; /* 0x5980168005a7572d */ + /*1250*/ FFMA R44, R87, R91.reuse, R44; /* 0x5980160005b7572c */ + /*1258*/ FFMA R46, R85.reuse, R91.reuse, R46; /* 0x5980170005b7552e */ + /* 0x101fc4c0fe2407f1 */ + /*1268*/ FFMA R47, R85, R90.reuse, R47; /* 0x5980178005a7552f */ + /*1270*/ FFMA R41, R86.reuse, R90.reuse, R41; /* 0x5980148005a75629 */ + /*1278*/ FFMA R40, R86, R91.reuse, R40; /* 0x5980140005b75628 */ + /* 0x181fc480fe2607f1 */ + /*1288*/ FFMA R42, R84.reuse, R91.reuse, R42; /* 0x5980150005b7542a */ + /*1290*/ FFMA R43, R84, R90.reuse, R43; /* 0x5980158005a7542b */ + /*1298*/ FFMA R13, R83.reuse, R90.reuse, R13; /* 0x5980068005a7530d */ + /* 0x101fc4c0fe2407f1 */ + /*12a8*/ FFMA R12, R83, R91.reuse, R12; /* 0x5980060005b7530c */ + /*12b0*/ FFMA R14, R81.reuse, R91.reuse, R14; /* 0x5980070005b7510e */ + /*12b8*/ FFMA R15, R81, R90.reuse, R15; /* 0x5980078005a7510f */ + /* 0x081fc0c0fe2607f1 */ + /*12c8*/ FFMA R9, R82.reuse, R90.reuse, R9; /* 0x5980048005a75209 */ + /*12d0*/ FFMA R8, R82.reuse, R91.reuse, R8; /* 0x5980040005b75208 */ + /*12d8*/ { FFMA R10, R80.reuse, R91, R10; /* 0x5980050005b7500a */ + /*12e8*/ @P0 STS.128 [R118], R96; } /* 0x181f8440fe2017f1 */ + /* 0xef5e000000007660 */ + /*12f0*/ FFMA R11, R80.reuse, R90, R11; /* 0x5980058005a7500b */ + /*12f8*/ FFMA R17, R82.reuse, R92.reuse, R17; /* 0x5980088005c75211 */ + /* 0x001fc4c0fe0407f1 */ + /*1308*/ FFMA R16, R82, R93.reuse, R16; /* 0x5980080005d75210 */ + /*1310*/ { FFMA R18, R80.reuse, R93.reuse, R18; /* 0x5980090005d75012 */ + /*1318*/ @P0 STS.128 [R118+0x200], R100; } /* 0xef5e000020007664 */ + /* 
0x101fc4c0fe2407f1 */ + /*1328*/ FFMA R19, R80, R92.reuse, R19; /* 0x5980098005c75013 */ + /*1330*/ FFMA R21, R83.reuse, R92.reuse, R21; /* 0x59800a8005c75315 */ + /*1338*/ FFMA R20, R83, R93.reuse, R20; /* 0x59800a0005d75314 */ + /* 0x181fc480fe2607f1 */ + /*1348*/ FFMA R22, R81.reuse, R93.reuse, R22; /* 0x59800b0005d75116 */ + /*1350*/ FFMA R23, R81, R92.reuse, R23; /* 0x59800b8005c75117 */ + /*1358*/ FFMA R49, R86.reuse, R92.reuse, R49; /* 0x5980188005c75631 */ + /* 0x101fc4c0fe2407f1 */ + /*1368*/ FFMA R48, R86, R93.reuse, R48; /* 0x5980180005d75630 */ + /*1370*/ FFMA R50, R84.reuse, R93.reuse, R50; /* 0x5980190005d75432 */ + /*1378*/ FFMA R51, R84, R92.reuse, R51; /* 0x5980198005c75433 */ + /* 0x081fc4c0fe2607f1 */ + /*1388*/ FFMA R53, R87.reuse, R92.reuse, R53; /* 0x59801a8005c75735 */ + /*1390*/ FFMA R52, R87.reuse, R93.reuse, R52; /* 0x59801a0005d75734 */ + /*1398*/ FFMA R54, R85.reuse, R93, R54; /* 0x59801b0005d75536 */ + /* 0x101fc4c0fe2207f1 */ + /*13a8*/ FFMA R55, R85.reuse, R92, R55; /* 0x59801b8005c75537 */ + /*13b0*/ FFMA R61, R87.reuse, R94.reuse, R61; /* 0x59801e8005e7573d */ + /*13b8*/ FFMA R60, R87, R95.reuse, R60; /* 0x59801e0005f7573c */ + /* 0x181fc480fe2607f1 */ + /*13c8*/ FFMA R62, R85.reuse, R95.reuse, R62; /* 0x59801f0005f7553e */ + /*13d0*/ FFMA R63, R85, R94.reuse, R63; /* 0x59801f8005e7553f */ + /*13d8*/ FFMA R57, R86.reuse, R94.reuse, R57; /* 0x59801c8005e75639 */ + /* 0x101fc4c0fe2407f1 */ + /*13e8*/ FFMA R56, R86, R95.reuse, R56; /* 0x59801c0005f75638 */ + /*13f0*/ FFMA R58, R84.reuse, R95.reuse, R58; /* 0x59801d0005f7543a */ + /*13f8*/ FFMA R59, R84, R94.reuse, R59; /* 0x59801d8005e7543b */ + /* 0x181fc480fe2607f1 */ + /*1408*/ FFMA R29, R83.reuse, R94.reuse, R29; /* 0x59800e8005e7531d */ + /*1410*/ FFMA R28, R83, R95.reuse, R28; /* 0x59800e0005f7531c */ + /*1418*/ FFMA R30, R81.reuse, R95.reuse, R30; /* 0x59800f0005f7511e */ + /* 0x101fc4c0fe2407f1 */ + /*1428*/ FFMA R31, R81, R94.reuse, R31; /* 0x59800f8005e7511f */ + /*1430*/ 
FFMA R25, R82.reuse, R94.reuse, R25; /* 0x59800c8005e75219 */ + /*1438*/ FFMA R24, R82, R95.reuse, R24; /* 0x59800c0005f75218 */ + /* 0x183fc000fe2207f1 */ + /*1448*/ FFMA R26, R80.reuse, R95, R26; /* 0x59800d0005f7501a */ + /*1450*/ FFMA R27, R80, R94, R27; /* 0x59800d8005e7501b */ + /*1458*/ { FFMA R1, R66.reuse, R72.reuse, R1; /* 0x5980008004874201 */ + /*1468*/ LDS.U.128 R80, [R114+0x700]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100070077250 */ + /*1470*/ FFMA R0, R66, R73.reuse, R0; /* 0x5980000004974200 */ + /*1478*/ { FFMA R2, R64.reuse, R73.reuse, R2; /* 0x5980010004974002 */ + /*1488*/ LDS.U.128 R88, [R115+0x700]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100070077358 */ + /*1490*/ FFMA R3, R64, R72.reuse, R3; /* 0x5980018004874003 */ + /*1498*/ { FFMA R5, R67.reuse, R72.reuse, R5; /* 0x5980028004874305 */ + /*14a8*/ LDS.U.128 R84, [R114+0x780]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100078077254 */ + /*14b0*/ FFMA R4, R67, R73.reuse, R4; /* 0x5980020004974304 */ + /*14b8*/ { FFMA R6, R65.reuse, R73.reuse, R6; /* 0x5980030004974106 */ + /*14c8*/ LDS.U.128 R92, [R115+0x780]; } /* 0x181fc480fe200711 */ + /* 0xef4e10007807735c */ + /*14d0*/ FFMA R7, R65, R72.reuse, R7; /* 0x5980038004874107 */ + /*14d8*/ FFMA R33, R70.reuse, R72.reuse, R33; /* 0x5980108004874621 */ + /* 0x101fc4c0fe2407f1 */ + /*14e8*/ FFMA R32, R70, R73.reuse, R32; /* 0x5980100004974620 */ + /*14f0*/ FFMA R34, R68.reuse, R73.reuse, R34; /* 0x5980110004974422 */ + /*14f8*/ FFMA R35, R68, R72.reuse, R35; /* 0x5980118004874423 */ + /* 0x081fc4c0fe2607f1 */ + /*1508*/ FFMA R37, R71.reuse, R72.reuse, R37; /* 0x5980128004874725 */ + /*1510*/ FFMA R36, R71.reuse, R73.reuse, R36; /* 0x5980120004974724 */ + /*1518*/ FFMA R38, R69.reuse, R73, R38; /* 0x5980130004974526 */ + /* 0x101fc4c0fe2207f1 */ + /*1528*/ FFMA R39, R69.reuse, R72, R39; /* 0x5980138004874527 */ + /*1530*/ FFMA R45, R71.reuse, R74.reuse, R45; /* 0x5980168004a7472d */ + /*1538*/ FFMA R44, R71, R75.reuse, R44; /* 0x5980160004b7472c */ + /* 
0x181fc480fe2607f1 */ + /*1548*/ FFMA R46, R69.reuse, R75.reuse, R46; /* 0x5980170004b7452e */ + /*1550*/ FFMA R47, R69, R74.reuse, R47; /* 0x5980178004a7452f */ + /*1558*/ FFMA R41, R70.reuse, R74.reuse, R41; /* 0x5980148004a74629 */ + /* 0x101fc4c0fe2407f1 */ + /*1568*/ FFMA R40, R70, R75.reuse, R40; /* 0x5980140004b74628 */ + /*1570*/ FFMA R42, R68.reuse, R75.reuse, R42; /* 0x5980150004b7442a */ + /*1578*/ FFMA R43, R68, R74.reuse, R43; /* 0x5980158004a7442b */ + /* 0x181fc480fe2607f1 */ + /*1588*/ FFMA R13, R67.reuse, R74.reuse, R13; /* 0x5980068004a7430d */ + /*1590*/ FFMA R12, R67, R75.reuse, R12; /* 0x5980060004b7430c */ + /*1598*/ FFMA R14, R65.reuse, R75.reuse, R14; /* 0x5980070004b7410e */ + /* 0x181fc4c0fe2407f1 */ + /*15a8*/ FFMA R15, R65, R74.reuse, R15; /* 0x5980078004a7410f */ + /*15b0*/ FFMA R9, R66.reuse, R74.reuse, R9; /* 0x5980048004a74209 */ + /*15b8*/ FFMA R8, R66.reuse, R75.reuse, R8; /* 0x5980040004b74208 */ + /* 0x081fc404fe2207f0 */ + /*15c8*/ { FFMA R10, R64.reuse, R75, R10; /* 0x5980050004b7400a */ + /*15d0*/ @P0 STS.128 [R118+0x400], R104; } /* 0xef5e000040007668 */ + /*15d8*/ FFMA R11, R64.reuse, R74, R11; /* 0x5980058004a7400b */ + /* 0x181fc080fe2607e1 */ + /*15e8*/ FFMA R17, R66.reuse, R76.reuse, R17; /* 0x5980088004c74211 */ + /*15f0*/ FFMA R16, R66, R77.reuse, R16; /* 0x5980080004d74210 */ + /*15f8*/ { FFMA R18, R64.reuse, R77.reuse, R18; /* 0x5980090004d74012 */ + /*1608*/ @P0 STS.128 [R118+0x600], R108; } /* 0x181fc480fe2007f1 */ + /* 0xef5e00006000766c */ + /*1610*/ FFMA R19, R64, R76.reuse, R19; /* 0x5980098004c74013 */ + /*1618*/ FFMA R21, R67.reuse, R76.reuse, R21; /* 0x59800a8004c74315 */ + /* 0x101fc4c0fe2407f1 */ + /*1628*/ FFMA R20, R67, R77.reuse, R20; /* 0x59800a0004d74314 */ + /*1630*/ FFMA R22, R65.reuse, R77.reuse, R22; /* 0x59800b0004d74116 */ + /*1638*/ FFMA R23, R65, R76.reuse, R23; /* 0x59800b8004c74117 */ + /* 0x181fc480fe2607f1 */ + /*1648*/ FFMA R49, R70.reuse, R76.reuse, R49; /* 0x5980188004c74631 */ + 
/*1650*/ FFMA R48, R70, R77.reuse, R48; /* 0x5980180004d74630 */ + /*1658*/ FFMA R50, R68.reuse, R77.reuse, R50; /* 0x5980190004d74432 */ + /* 0x181fc4c0fe2407f1 */ + /*1668*/ FFMA R51, R68, R76.reuse, R51; /* 0x5980198004c74433 */ + /*1670*/ FFMA R53, R71.reuse, R76.reuse, R53; /* 0x59801a8004c74735 */ + /*1678*/ FFMA R52, R71.reuse, R77.reuse, R52; /* 0x59801a0004d74734 */ + /* 0x181fc440fe2207f1 */ + /*1688*/ FFMA R54, R69.reuse, R77, R54; /* 0x59801b0004d74536 */ + /*1690*/ FFMA R55, R69.reuse, R76, R55; /* 0x59801b8004c74537 */ + /*1698*/ FFMA R61, R71.reuse, R78.reuse, R61; /* 0x59801e8004e7473d */ + /* 0x101fc4c0fe2407f1 */ + /*16a8*/ FFMA R60, R71, R79.reuse, R60; /* 0x59801e0004f7473c */ + /*16b0*/ FFMA R62, R69.reuse, R79.reuse, R62; /* 0x59801f0004f7453e */ + /*16b8*/ FFMA R63, R69, R78.reuse, R63; /* 0x59801f8004e7453f */ + /* 0x181fc480fe2607f1 */ + /*16c8*/ FFMA R57, R70.reuse, R78.reuse, R57; /* 0x59801c8004e74639 */ + /*16d0*/ FFMA R56, R70, R79.reuse, R56; /* 0x59801c0004f74638 */ + /*16d8*/ FFMA R58, R68.reuse, R79.reuse, R58; /* 0x59801d0004f7443a */ + /* 0x101fc4c0fe2407f1 */ + /*16e8*/ FFMA R59, R68, R78.reuse, R59; /* 0x59801d8004e7443b */ + /*16f0*/ FFMA R29, R67.reuse, R78.reuse, R29; /* 0x59800e8004e7431d */ + /*16f8*/ FFMA R28, R67, R79.reuse, R28; /* 0x59800e0004f7431c */ + /* 0x181fc480fe2607f1 */ + /*1708*/ FFMA R30, R65.reuse, R79.reuse, R30; /* 0x59800f0004f7411e */ + /*1710*/ FFMA R31, R65, R78.reuse, R31; /* 0x59800f8004e7411f */ + /*1718*/ FFMA R25, R66.reuse, R78.reuse, R25; /* 0x59800c8004e74219 */ + /* 0x003fd440fe0407f1 */ + /*1728*/ FFMA R24, R66, R79.reuse, R24; /* 0x59800c0004f74218 */ + /*1730*/ { FFMA R26, R64.reuse, R79, R26; /* 0x59800d0004f7401a */ + /*1738*/ BAR.SYNC 0x0; } /* 0xf0a81b8000070000 */ + /* 0x001fc400fe2007f1 */ + /*1748*/ @P0 LOP.XOR R114, R114, 0x1000; /* 0x3847040100007272 */ + /*1750*/ @P0 LOP.XOR R115, R115, 0x1000; /* 0x3847040100007373 */ + /*1758*/ @P0 LOP.XOR R118, R118, 0x1000; /* 
0x3847040100007676 */ + /* 0x001fc4c0fe0007f1 */ + /*1768*/ FFMA R27, R64, R78, R27; /* 0x59800d8004e7401b */ + /*1770*/ { FFMA R1, R82.reuse, R88.reuse, R1; /* 0x5980008005875201 */ + /*1778*/ @P0 LDS.U.128 R64, [R114]; } /* 0xef4e100000007240 */ + /* 0x001fc4c0fe0407f1 */ + /*1788*/ FFMA R0, R82, R89.reuse, R0; /* 0x5980000005975200 */ + /*1790*/ { FFMA R2, R80.reuse, R89.reuse, R2; /* 0x5980010005975002 */ + /*1798*/ @P0 LDS.U.128 R72, [R115]; } /* 0xef4e100000007348 */ + /* 0x001fc4c0fe0407f1 */ + /*17a8*/ FFMA R3, R80, R88.reuse, R3; /* 0x5980018005875003 */ + /*17b0*/ { FFMA R5, R83.reuse, R88.reuse, R5; /* 0x5980028005875305 */ + /*17b8*/ @P0 LDS.U.128 R68, [R114+0x80]; } /* 0xef4e100008007244 */ + /* 0x001c44c0fe0407f1 */ + /*17c8*/ FFMA R4, R83, R89.reuse, R4; /* 0x5980020005975304 */ + /*17d0*/ { FFMA R6, R81.reuse, R89.reuse, R6; /* 0x5980030005975106 */ + /*17d8*/ @P0 LDS.U.128 R76, [R115+0x80]; } /* 0xef4e10000800734c */ + /* 0x101fc4c0fe2407f1 */ + /*17e8*/ FFMA R7, R81, R88.reuse, R7; /* 0x5980038005875107 */ + /*17f0*/ FFMA R33, R86.reuse, R88.reuse, R33; /* 0x5980108005875621 */ + /*17f8*/ FFMA R32, R86, R89.reuse, R32; /* 0x5980100005975620 */ + /* 0x181fc480fe2607f1 */ + /*1808*/ FFMA R34, R84.reuse, R89.reuse, R34; /* 0x5980110005975422 */ + /*1810*/ FFMA R35, R84, R88.reuse, R35; /* 0x5980118005875423 */ + /*1818*/ FFMA R37, R87.reuse, R88.reuse, R37; /* 0x5980128005875725 */ + /* 0x081fc440fe2607f1 */ + /*1828*/ FFMA R36, R87.reuse, R89.reuse, R36; /* 0x5980120005975724 */ + /*1830*/ FFMA R38, R85.reuse, R89, R38; /* 0x5980130005975526 */ + /*1838*/ FFMA R39, R85.reuse, R88, R39; /* 0x5980138005875527 */ + /* 0x181fc480fe2607f1 */ + /*1848*/ FFMA R45, R87.reuse, R90.reuse, R45; /* 0x5980168005a7572d */ + /*1850*/ FFMA R44, R87, R91.reuse, R44; /* 0x5980160005b7572c */ + /*1858*/ FFMA R46, R85.reuse, R91.reuse, R46; /* 0x5980170005b7552e */ + /* 0x101fc4c0fe2407f1 */ + /*1868*/ FFMA R47, R85, R90.reuse, R47; /* 0x5980178005a7552f */ + /*1870*/ 
FFMA R41, R86.reuse, R90.reuse, R41; /* 0x5980148005a75629 */ + /*1878*/ FFMA R40, R86, R91.reuse, R40; /* 0x5980140005b75628 */ + /* 0x181fc480fe2607f1 */ + /*1888*/ FFMA R42, R84.reuse, R91.reuse, R42; /* 0x5980150005b7542a */ + /*1890*/ FFMA R43, R84, R90.reuse, R43; /* 0x5980158005a7542b */ + /*1898*/ FFMA R13, R83.reuse, R90.reuse, R13; /* 0x5980068005a7530d */ + /* 0x101fc4c0fe2407f1 */ + /*18a8*/ FFMA R12, R83, R91.reuse, R12; /* 0x5980060005b7530c */ + /*18b0*/ FFMA R14, R81.reuse, R91.reuse, R14; /* 0x5980070005b7510e */ + /*18b8*/ FFMA R15, R81, R90.reuse, R15; /* 0x5980078005a7510f */ + /* 0x081fc4c0fe2607f1 */ + /*18c8*/ FFMA R9, R82.reuse, R90.reuse, R9; /* 0x5980048005a75209 */ + /*18d0*/ FFMA R8, R82.reuse, R91.reuse, R8; /* 0x5980040005b75208 */ + /*18d8*/ FFMA R10, R80.reuse, R91, R10; /* 0x5980050005b7500a */ + /* 0x101fc4c0fc2207f1 */ + /*18e8*/ FFMA R11, R80.reuse, R90, R11; /* 0x5980058005a7500b */ + /*18f0*/ FFMA R17, R82.reuse, R92.reuse, R17; /* 0x5980088005c75211 */ + /*18f8*/ FFMA R16, R82, R93.reuse, R16; /* 0x5980080005d75210 */ + /* 0x181fc480fe2607f1 */ + /*1908*/ FFMA R18, R80.reuse, R93.reuse, R18; /* 0x5980090005d75012 */ + /*1910*/ FFMA R19, R80, R92.reuse, R19; /* 0x5980098005c75013 */ + /*1918*/ FFMA R21, R83.reuse, R92.reuse, R21; /* 0x59800a8005c75315 */ + /* 0x101fc4c0fe2407f1 */ + /*1928*/ FFMA R20, R83, R93.reuse, R20; /* 0x59800a0005d75314 */ + /*1930*/ FFMA R22, R81.reuse, R93.reuse, R22; /* 0x59800b0005d75116 */ + /*1938*/ FFMA R23, R81, R92.reuse, R23; /* 0x59800b8005c75117 */ + /* 0x181fc480fe2607f1 */ + /*1948*/ FFMA R49, R86.reuse, R92.reuse, R49; /* 0x5980188005c75631 */ + /*1950*/ FFMA R48, R86, R93.reuse, R48; /* 0x5980180005d75630 */ + /*1958*/ FFMA R50, R84.reuse, R93.reuse, R50; /* 0x5980190005d75432 */ + /* 0x181fc4c0fe2407f1 */ + /*1968*/ FFMA R51, R84, R92.reuse, R51; /* 0x5980198005c75433 */ + /*1970*/ FFMA R53, R87.reuse, R92.reuse, R53; /* 0x59801a8005c75735 */ + /*1978*/ FFMA R52, R87.reuse, R93.reuse, 
R52; /* 0x59801a0005d75734 */ + /* 0x181fc440fe2207f1 */ + /*1988*/ FFMA R54, R85.reuse, R93, R54; /* 0x59801b0005d75536 */ + /*1990*/ FFMA R55, R85.reuse, R92, R55; /* 0x59801b8005c75537 */ + /*1998*/ FFMA R61, R87.reuse, R94.reuse, R61; /* 0x59801e8005e7573d */ + /* 0x101fc4c0fe2407f1 */ + /*19a8*/ FFMA R60, R87, R95.reuse, R60; /* 0x59801e0005f7573c */ + /*19b0*/ FFMA R62, R85.reuse, R95.reuse, R62; /* 0x59801f0005f7553e */ + /*19b8*/ FFMA R63, R85, R94.reuse, R63; /* 0x59801f8005e7553f */ + /* 0x181fc480fe2607f1 */ + /*19c8*/ FFMA R57, R86.reuse, R94.reuse, R57; /* 0x59801c8005e75639 */ + /*19d0*/ FFMA R56, R86, R95.reuse, R56; /* 0x59801c0005f75638 */ + /*19d8*/ FFMA R58, R84.reuse, R95.reuse, R58; /* 0x59801d0005f7543a */ + /* 0x101fc4c0fe2407f1 */ + /*19e8*/ FFMA R59, R84, R94.reuse, R59; /* 0x59801d8005e7543b */ + /*19f0*/ FFMA R29, R83.reuse, R94.reuse, R29; /* 0x59800e8005e7531d */ + /*19f8*/ FFMA R28, R83, R95.reuse, R28; /* 0x59800e0005f7531c */ + /* 0x181fc480fe2607f1 */ + /*1a08*/ FFMA R30, R81.reuse, R95.reuse, R30; /* 0x59800f0005f7511e */ + /*1a10*/ FFMA R31, R81, R94.reuse, R31; /* 0x59800f8005e7511f */ + /*1a18*/ FFMA R25, R82.reuse, R94.reuse, R25; /* 0x59800c8005e75219 */ + /* 0x001fc440fe2407f1 */ + /*1a28*/ FFMA R24, R82, R95.reuse, R24; /* 0x59800c0005f75218 */ + /*1a30*/ FFMA R26, R80.reuse, R95, R26; /* 0x59800d0005f7501a */ + /*1a38*/ FFMA R27, R80, R94, R27; /* 0x59800d8005e7501b */ + /* 0x101fc480fe2407f1 */ + /*1a48*/ @P0 IADD R112, R112, R121.reuse; /* 0x5c10000007907070 */ + /*1a50*/ @P0 IADD R116, R116, R121.reuse; /* 0x5c10000007907474 */ + /*1a58*/ @P0 IADD R120, R120, R121.reuse; /* 0x5c10000007907878 */ + /* 0x081fc400fca007f0 */ + /*1a68*/ { @P0 IADD R124, R124, R121; /* 0x5c10000007907c7c */ + /*1a70*/ @P0 BRA 0x310; } /* 0xe2400ffe8980000f */ + /*1a78*/ SHR.U32 R80, R123.reuse, 0x1; /* 0x3828000000177b50 */ + /* 0x001fc480fe2007f1 */ + /*1a88*/ MOV R81, c[0x0][0x158]; /* 0x4c98078005670051 */ + /*1a90*/ ISCADD R84, R125, 
R126.reuse, 0x6; /* 0x5c18030007e77d54 */ + /*1a98*/ MOV R72, c[0x0][0x15c]; /* 0x4c98078005770048 */ + /* 0x001fc400fe2007f1 */ + /*1aa8*/ ISCADD R92, R123, R126, 0x3; /* 0x5c18018007e77b5c */ + /*1ab0*/ LOP.AND R114, R114, 0x7ff; /* 0x384700007ff77272 */ + /*1ab8*/ ISCADD R80, R122, R80, 0x6; /* 0x5c18030005077a50 */ + /* 0x001fc440fe2007f1 */ + /*1ac8*/ LOP.AND R115, R115, 0x7ff; /* 0x384700007ff77373 */ + /*1ad0*/ SHL R77, R81.reuse, 0x2; /* 0x384800000027514d */ + /*1ad8*/ ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; /* 0x4b6303800517542f */ + /* 0x081fc400fe2207f1 */ + /*1ae8*/ SHL R89, R81.reuse, 0x4; /* 0x3848000000475159 */ + /*1af0*/ FMUL R64, R3, R72; /* 0x5c68000004870340 */ + /*1af8*/ SHL R91, R81.reuse, 0x5; /* 0x384800000057515b */ + /* 0x001fc400fe2607f1 */ + /*1b08*/ XMAD.MRG R74, R80.reuse, R81.H1.reuse, RZ; /* 0x5b007fa80517504a */ + /*1b10*/ ISCADD R93, R115, R114, 0x4; /* 0x5c1802000727735d */ + /*1b18*/ XMAD R73, R80, R81, R84; /* 0x5b002a0005175049 */ + /* 0x001fc400fe2007f1 */ + /*1b28*/ SHL R92, R92, 0x2; /* 0x3848000000275c5c */ + /*1b30*/ IADD R84, R84, 0x20; /* 0x3810000002075454 */ + /*1b38*/ ISCADD R85, R81, -R89, 0x7; /* 0x5c19038005975155 */ + /* 0x001fc480fe2407f1 */ + /*1b48*/ FMUL R65, R7, R72.reuse; /* 0x5c68000004870741 */ + /*1b50*/ FMUL R66, R1, R72.reuse; /* 0x5c68000004870142 */ + /*1b58*/ XMAD.PSL.CBCC R73, R80.H1, R74.H1, R73; /* 0x5b30249804a75049 */ + /* 0x101fc400fe2007f1 */ + /*1b68*/ IADD R80, R80, -0x1; /* 0x3910007ffff75050 */ + /*1b70*/ ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT; /* 0x4b63038005175437 */ + /*1b78*/ FMUL R67, R5, R72.reuse; /* 0x5c68000004870543 */ + /* 0x001fc480fe2407f1 */ + /*1b88*/ FMUL R68, R35, R72.reuse; /* 0x5c68000004872344 */ + /*1b90*/ FMUL R69, R39, R72.reuse; /* 0x5c68000004872745 */ + /*1b98*/ ISCADD R76, R73, c[0x0][0x140], 0x2; /* 0x4c1801000507494c */ + /* 0x001fc440fe2207f1 */ + /*1ba8*/ IADD R86, R80.reuse, 0x4; /* 0x3810000000475056 */ + /*1bb0*/ IADD R87, R80.reuse, 0x8; /* 
0x3810000000875057 */ + /*1bb8*/ IADD R88, R80, 0xc; /* 0x3810000000c75058 */ + /* 0x001f9800fe2407f1 */ + /*1bc8*/ FMUL R70, R33, R72.reuse; /* 0x5c68000004872146 */ + /*1bd0*/ FMUL R71, R37, R72; /* 0x5c68000004872547 */ + /*1bd8*/ IADD R76, R76, -R77; /* 0x5c11000004d74c4c */ + /* 0x001fc080fca207f1 */ + /*1be8*/ IADD R75, R76.reuse, R89; /* 0x5c10000005974c4b */ + /*1bf0*/ IADD R78, R76, R91.reuse; /* 0x5c10000005b74c4e */ + /*1bf8*/ { IADD R79, R75, R91; /* 0x5c10000005b74b4f */ + /*1c08*/ CAL 0x1f10; } /* 0x101fc482fe2007f5 */ + /* 0xe260000030000040 */ + /*1c10*/ FMUL R64, R2, R72.reuse; /* 0x5c68000004870240 */ + /*1c18*/ FMUL R65, R6, R72.reuse; /* 0x5c68000004870641 */ + /* 0x101fc480fe2407f1 */ + /*1c28*/ FMUL R66, R0, R72.reuse; /* 0x5c68000004870042 */ + /*1c30*/ FMUL R67, R4, R72.reuse; /* 0x5c68000004870443 */ + /*1c38*/ FMUL R68, R34, R72.reuse; /* 0x5c68000004872244 */ + /* 0x001fc080fe2407f1 */ + /*1c48*/ FMUL R69, R38, R72.reuse; /* 0x5c68000004872645 */ + /*1c50*/ FMUL R70, R32, R72.reuse; /* 0x5c68000004872046 */ + /*1c58*/ { FMUL R71, R36, R72; /* 0x5c68000004872447 */ + /*1c68*/ CAL 0x1f10; } /* 0x101fc482fe2007f5 */ + /* 0xe26000002a000040 */ + /*1c70*/ FMUL R64, R11, R72.reuse; /* 0x5c68000004870b40 */ + /*1c78*/ FMUL R65, R15, R72.reuse; /* 0x5c68000004870f41 */ + /* 0x101fc480fe2407f1 */ + /*1c88*/ FMUL R66, R9, R72.reuse; /* 0x5c68000004870942 */ + /*1c90*/ FMUL R67, R13, R72.reuse; /* 0x5c68000004870d43 */ + /*1c98*/ FMUL R68, R43, R72.reuse; /* 0x5c68000004872b44 */ + /* 0x001fc080fe2407f1 */ + /*1ca8*/ FMUL R69, R47, R72.reuse; /* 0x5c68000004872f45 */ + /*1cb0*/ FMUL R70, R41, R72.reuse; /* 0x5c68000004872946 */ + /*1cb8*/ { FMUL R71, R45, R72; /* 0x5c68000004872d47 */ + /*1cc8*/ CAL 0x1f10; } /* 0x101fc482fe2007f5 */ + /* 0xe260000024000040 */ + /*1cd0*/ FMUL R64, R10, R72.reuse; /* 0x5c68000004870a40 */ + /*1cd8*/ FMUL R65, R14, R72.reuse; /* 0x5c68000004870e41 */ + /* 0x101fc480fe2407f1 */ + /*1ce8*/ FMUL R66, R8, R72.reuse; /* 
0x5c68000004870842 */ + /*1cf0*/ FMUL R67, R12, R72.reuse; /* 0x5c68000004870c43 */ + /*1cf8*/ FMUL R68, R42, R72.reuse; /* 0x5c68000004872a44 */ + /* 0x001fc080fe2407f1 */ + /*1d08*/ FMUL R69, R46, R72.reuse; /* 0x5c68000004872e45 */ + /*1d10*/ FMUL R70, R40, R72.reuse; /* 0x5c68000004872846 */ + /*1d18*/ { FMUL R71, R44, R72; /* 0x5c68000004872c47 */ + /*1d28*/ CAL 0x1f10; } /* 0x001fc400fe2007f5 */ + /* 0xe26000001e000040 */ + /*1d30*/ IADD R80, R80, 0x1c; /* 0x3810000001c75050 */ + /*1d38*/ IADD R86, R86, 0x1c; /* 0x3810000001c75656 */ + /* 0x105fc400fe2007f1 */ + /*1d48*/ IADD R87, R87, 0x1c; /* 0x3810000001c75757 */ + /*1d50*/ IADD R88, R88, 0x1c; /* 0x3810000001c75858 */ + /*1d58*/ IADD R76, R76, R85.reuse; /* 0x5c10000005574c4c */ + /* 0x001fc480fe2407f1 */ + /*1d68*/ IADD R75, R75, R85.reuse; /* 0x5c10000005574b4b */ + /*1d70*/ IADD R78, R78, R85.reuse; /* 0x5c10000005574e4e */ + /*1d78*/ IADD R79, R79, R85; /* 0x5c10000005574f4f */ + /* 0x101fc480fe2407f1 */ + /*1d88*/ FMUL R64, R19, R72.reuse; /* 0x5c68000004871340 */ + /*1d90*/ FMUL R65, R23, R72.reuse; /* 0x5c68000004871741 */ + /*1d98*/ FMUL R66, R17, R72.reuse; /* 0x5c68000004871142 */ + /* 0x101fc480fe2407f1 */ + /*1da8*/ FMUL R67, R21, R72.reuse; /* 0x5c68000004871543 */ + /*1db0*/ FMUL R68, R51, R72.reuse; /* 0x5c68000004873344 */ + /*1db8*/ FMUL R69, R55, R72.reuse; /* 0x5c68000004873745 */ + /* 0x001fd400fe0407f1 */ + /*1dc8*/ FMUL R70, R49, R72.reuse; /* 0x5c68000004873146 */ + /*1dd0*/ { FMUL R71, R53, R72; /* 0x5c68000004873547 */ + /*1dd8*/ CAL 0x1f10; } /* 0xe260000013000040 */ + /* 0x101fc480fe2417f1 */ + /*1de8*/ FMUL R64, R18, R72.reuse; /* 0x5c68000004871240 */ + /*1df0*/ FMUL R65, R22, R72.reuse; /* 0x5c68000004871641 */ + /*1df8*/ FMUL R66, R16, R72.reuse; /* 0x5c68000004871042 */ + /* 0x101fc480fe2407f1 */ + /*1e08*/ FMUL R67, R20, R72.reuse; /* 0x5c68000004871443 */ + /*1e10*/ FMUL R68, R50, R72.reuse; /* 0x5c68000004873244 */ + /*1e18*/ FMUL R69, R54, R72.reuse; /* 
0x5c68000004873645 */ + /* 0x001fd400fe0407f1 */ + /*1e28*/ FMUL R70, R48, R72.reuse; /* 0x5c68000004873046 */ + /*1e30*/ { FMUL R71, R52, R72; /* 0x5c68000004873447 */ + /*1e38*/ CAL 0x1f10; } /* 0xe26000000d000040 */ + /* 0x101fc480fe2417f1 */ + /*1e48*/ FMUL R64, R27, R72.reuse; /* 0x5c68000004871b40 */ + /*1e50*/ FMUL R65, R31, R72.reuse; /* 0x5c68000004871f41 */ + /*1e58*/ FMUL R66, R25, R72.reuse; /* 0x5c68000004871942 */ + /* 0x101fc480fe2407f1 */ + /*1e68*/ FMUL R67, R29, R72.reuse; /* 0x5c68000004871d43 */ + /*1e70*/ FMUL R68, R59, R72.reuse; /* 0x5c68000004873b44 */ + /*1e78*/ FMUL R69, R63, R72.reuse; /* 0x5c68000004873f45 */ + /* 0x001fd400fe0407f1 */ + /*1e88*/ FMUL R70, R57, R72.reuse; /* 0x5c68000004873946 */ + /*1e90*/ { FMUL R71, R61, R72; /* 0x5c68000004873d47 */ + /*1e98*/ CAL 0x1f10; } /* 0xe260000007000040 */ + /* 0x101fc480fe2417f1 */ + /*1ea8*/ FMUL R64, R26, R72.reuse; /* 0x5c68000004871a40 */ + /*1eb0*/ FMUL R65, R30, R72.reuse; /* 0x5c68000004871e41 */ + /*1eb8*/ FMUL R66, R24, R72.reuse; /* 0x5c68000004871842 */ + /* 0x101fc480fe2407f1 */ + /*1ec8*/ FMUL R67, R28, R72.reuse; /* 0x5c68000004871c43 */ + /*1ed0*/ FMUL R68, R58, R72.reuse; /* 0x5c68000004873a44 */ + /*1ed8*/ FMUL R69, R62, R72.reuse; /* 0x5c68000004873e45 */ + /* 0x001fd400fe0407f1 */ + /*1ee8*/ FMUL R70, R56, R72.reuse; /* 0x5c68000004873846 */ + /*1ef0*/ { FMUL R71, R60, R72; /* 0x5c68000004873c47 */ + /*1ef8*/ CAL 0x1f10; } /* 0xe260000001000040 */ + /* 0x001fc400fe0007f5 */ + /*1f08*/ EXIT; /* 0xe30000000007000f */ + /*1f10*/ { IADD R80, R80, 0x1; /* 0x3810000000175050 */ + /*1f18*/ STS.128 [R93], R64; } /* 0xef5e000000075d40 */ + /* 0x001fc000fe2007f0 */ + /*1f28*/ { IADD R86, R86, 0x1; /* 0x3810000000175656 */ + /*1f30*/ STS.128 [R93+0x80], R68; } /* 0xef5e000008075d44 */ + /*1f38*/ { IADD R87, R87, 0x1; /* 0x3810000000175757 */ + /*1f48*/ LDS R64, [R92]; } /* 0x001fc400fe0007f1 */ + /* 0xef4c000000075c40 */ + /*1f50*/ { IADD R88, R88, 0x1; /* 0x3810000000175858 */ + 
/*1f58*/ LDS R65, [R92+0x80]; } /* 0xef4c000008075c41 */ + /* 0x101fc000fe2407f0 */ + /*1f68*/ { IADD R76, R76, R77.reuse; /* 0x5c10000004d74c4c */ + /*1f70*/ LDS R66, [R92+0x100]; } /* 0xef4c000010075c42 */ + /*1f78*/ { IADD R75, R75, R77.reuse; /* 0x5c10000004d74b4b */ + /*1f88*/ LDS R67, [R92+0x180]; } /* 0x001fc480fe0007f1 */ + /* 0xef4c000018075c43 */ + /*1f90*/ { IADD R78, R78, R77.reuse; /* 0x5c10000004d74e4e */ + /*1f98*/ LDS R68, [R92+0x200]; } /* 0xef4c000020075c44 */ + /* 0x081fc000fe2007f0 */ + /*1fa8*/ { IADD R79, R79, R77; /* 0x5c10000004d74f4f */ + /*1fb0*/ LDS R69, [R92+0x280]; } /* 0xef4c000028075c45 */ + /*1fb8*/ { ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; /* 0x4b63028005275007 */ + /*1fc8*/ LDS R70, [R92+0x300]; } /* 0x001c4400fe0007f1 */ + /* 0xef4c000030075c46 */ + /*1fd0*/ { ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6; /* 0x4b6303000527500f */ + /*1fd8*/ LDS R71, [R92+0x380]; } /* 0xef4c000038075c47 */ + /* 0x003fc400fd2207f2 */ + /*1fe8*/ ISETP.LT.AND P2, PT, R86.reuse, c[0x0][0x148], P5; /* 0x4b63028005275617 */ + /*1ff0*/ ISETP.LT.AND P3, PT, R86, c[0x0][0x148], P6; /* 0x4b6303000527561f */ + /*1ff8*/ @P0 STG.CG [R76], R64; /* 0xeedc400000004c40 */ + /* 0x001fc000fe2207f0 */ + /*2008*/ { ISETP.LT.AND P0, PT, R87.reuse, c[0x0][0x148], P5; /* 0x4b63028005275707 */ + /*2010*/ @P1 STG.CG [R76+0x80], R65; } /* 0xeedc400008014c41 */ + /*2018*/ { ISETP.LT.AND P1, PT, R87, c[0x0][0x148], P6; /* 0x4b6303000527570f */ + /*2028*/ @P2 STG.CG [R75], R66; } /* 0x001fc440fe2007f1 */ + /* 0xeedc400000024b42 */ + /*2030*/ ISETP.LT.AND P2, PT, R88.reuse, c[0x0][0x148], P5; /* 0x4b63028005275817 */ + /*2038*/ @P3 STG.CG [R75+0x80], R67; /* 0xeedc400008034b43 */ + /* 0x001fc400fe2007e9 */ + /*2048*/ ISETP.LT.AND P3, PT, R88, c[0x0][0x148], P6; /* 0x4b6303000527581f */ + /*2050*/ @P0 STG.CG [R78], R68; /* 0xeedc400000004e44 */ + /*2058*/ @P1 STG.CG [R78+0x80], R69; /* 0xeedc400008014e45 */ + /* 0x001fd4003e2007f2 */ + /*2068*/ @P2 STG.CG [R79], R70; /* 
0xeedc400000024f46 */ + /*2070*/ @P3 STG.CG [R79+0x80], R71; /* 0xeedc400008034f47 */ + /*2078*/ RET; /* 0xe32000000007000f */ + /* 0x001f8000fc0007ff */ + /*2088*/ BRA 0x2088; /* 0xe2400fffff87000f */ + /*2090*/ NOP; /* 0x50b0000000070f00 */ + /*2098*/ NOP; /* 0x50b0000000070f00 */ + /* 0x001f8000fc0007e0 */ + /*20a8*/ NOP; /* 0x50b0000000070f00 */ + /*20b0*/ NOP; /* 0x50b0000000070f00 */ + /*20b8*/ NOP; /* 0x50b0000000070f00 */ + ................................ + + diff --git a/Assembler/MaxAs/t/MaxAs-MaxAs.t b/Assembler/MaxAs/t/MaxAs-MaxAs.t new file mode 100644 index 0000000..ad9e988 --- /dev/null +++ b/Assembler/MaxAs/t/MaxAs-MaxAs.t @@ -0,0 +1,5 @@ +use strict; +use warnings; + +use Test::More tests => 1; +BEGIN { use_ok('MaxAs::MaxAs') }; diff --git a/Assembler/PascalAs/Changes b/Assembler/PascalAs/Changes new file mode 100644 index 0000000..a6d8a13 --- /dev/null +++ b/Assembler/PascalAs/Changes @@ -0,0 +1,4 @@ +Revision history for Perl extension MaxAs::MaxAs. + +1.01 Thu Mar 26 17:09:57 2015 + - original Perl packaged version diff --git a/Assembler/PascalAs/Install.sh b/Assembler/PascalAs/Install.sh new file mode 100755 index 0000000..57c8d24 --- /dev/null +++ b/Assembler/PascalAs/Install.sh @@ -0,0 +1,3 @@ +perl Makefile.PL +make +sudo make install diff --git a/Assembler/PascalAs/LICENSE b/Assembler/PascalAs/LICENSE new file mode 100644 index 0000000..6c28fad --- /dev/null +++ b/Assembler/PascalAs/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Scott Gray + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this 
permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/Assembler/PascalAs/MANIFEST b/Assembler/PascalAs/MANIFEST new file mode 100644 index 0000000..a25084c --- /dev/null +++ b/Assembler/PascalAs/MANIFEST @@ -0,0 +1,38 @@ +bin/maxas.pl +Changes +lib/MaxAs/Cubin.pm +lib/MaxAs/MaxAs.pm +lib/MaxAs/MaxAsGrammar.pm +LICENSE +Makefile.PL +MANIFEST +microbench/microbench.cpp +microbench/microbench.cu +microbench/microbench.sass +microbench/shared.pl +microbench/shared_lds.sass +microbench/shared_sts16.sass +microbench/throughput.pl +microbench/throughput.sass +microbench/throughput2.pl +microbench/throughput2.sass +microbench/throughput3.pl +microbench/throughput4.pl +microbench/throughput5.pl +microbench/xmad.pl +microbench/xmad2.sass +README.md +sgemm/batched_gemm.xlsx +sgemm/cublas_sgemm.ptx +sgemm/sgemm.cpp +sgemm/sgemm.cu +sgemm/sgemm.pl +sgemm/sgemm.sln +sgemm/sgemm.vcxproj +sgemm/sgemm128.sass +sgemm/sgemm64.sass +sgemm/sgemm_final_128.sass +sgemm/sgemm_final_64.sass +sgemm/sgemm_pre_128.sass +sgemm/sgemm_pre_64.sass +t/MaxAs-MaxAs.t diff --git a/Assembler/PascalAs/MYMETA.json b/Assembler/PascalAs/MYMETA.json new file mode 100644 index 0000000..ee7458f --- /dev/null +++ b/Assembler/PascalAs/MYMETA.json @@ -0,0 +1,42 @@ +{ + "abstract" : "Assembler for NVIDIA Maxwell architecture", + "author" : [ + "Scott Gray " + ], + "dynamic_config" : 0, + "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001", + "license" : [ + 
"mit" + ], + "meta-spec" : { + "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", + "version" : "2" + }, + "name" : "PascalAs-PascalAs", + "no_index" : { + "directory" : [ + "t", + "inc" + ] + }, + "prereqs" : { + "build" : { + "requires" : { + "ExtUtils::MakeMaker" : "0" + } + }, + "configure" : { + "requires" : { + "ExtUtils::MakeMaker" : "0" + } + }, + "runtime" : { + "requires" : { + "Carp" : "1.29", + "Data::Dumper" : "2.145" + } + } + }, + "release_status" : "stable", + "version" : "1.06" +} diff --git a/Assembler/PascalAs/MYMETA.yml b/Assembler/PascalAs/MYMETA.yml new file mode 100644 index 0000000..77a3de3 --- /dev/null +++ b/Assembler/PascalAs/MYMETA.yml @@ -0,0 +1,23 @@ +--- +abstract: 'Assembler for NVIDIA Maxwell architecture' +author: + - 'Scott Gray ' +build_requires: + ExtUtils::MakeMaker: '0' +configure_requires: + ExtUtils::MakeMaker: '0' +dynamic_config: 0 +generated_by: 'ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001' +license: mit +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: '1.4' +name: PascalAs-PascalAs +no_index: + directory: + - t + - inc +requires: + Carp: '1.29' + Data::Dumper: '2.145' +version: '1.06' diff --git a/Assembler/PascalAs/Makefile b/Assembler/PascalAs/Makefile new file mode 100644 index 0000000..bef3fb2 --- /dev/null +++ b/Assembler/PascalAs/Makefile @@ -0,0 +1,878 @@ +# This Makefile is for the PascalAs::PascalAs extension to perl. +# +# It was generated automatically by MakeMaker version +# 7.0401 (Revision: 70401) from the contents of +# Makefile.PL. Don't edit this file, edit Makefile.PL instead. +# +# ANY CHANGES MADE HERE WILL BE LOST! 
+# +# MakeMaker ARGV: () +# + +# MakeMaker Parameters: + +# ABSTRACT_FROM => q[lib/PascalAs/PascalAs.pm] +# AUTHOR => [q[Scott Gray ]] +# BUILD_REQUIRES => { } +# CONFIGURE_REQUIRES => { } +# EXE_FILES => [q[bin/pascalas.pl]] +# LICENSE => q[MIT] +# NAME => q[PascalAs::PascalAs] +# PREREQ_PM => { Carp=>q[1.29], Data::Dumper=>q[2.145] } +# TEST_REQUIRES => { } +# VERSION_FROM => q[lib/PascalAs/PascalAs.pm] + +# --- MakeMaker post_initialize section: + + +# --- MakeMaker const_config section: + +# These definitions are from config.sh (via /usr/lib/x86_64-linux-gnu/perl/5.22/Config.pm). +# They may have been overridden via Makefile.PL or on the command line. +AR = ar +CC = x86_64-linux-gnu-gcc +CCCDLFLAGS = -fPIC +CCDLFLAGS = -Wl,-E +DLEXT = so +DLSRC = dl_dlopen.xs +EXE_EXT = +FULL_AR = /usr/bin/ar +LD = x86_64-linux-gnu-gcc +LDDLFLAGS = -shared -L/usr/local/lib -fstack-protector-strong +LDFLAGS = -fstack-protector-strong -L/usr/local/lib +LIBC = libc-2.21.so +LIB_EXT = .a +OBJ_EXT = .o +OSNAME = linux +OSVERS = 3.16.0 +RANLIB = : +SITELIBEXP = /usr/local/share/perl/5.22.1 +SITEARCHEXP = /usr/local/lib/x86_64-linux-gnu/perl/5.22.1 +SO = so +VENDORARCHEXP = /usr/lib/x86_64-linux-gnu/perl5/5.22 +VENDORLIBEXP = /usr/share/perl5 + + +# --- MakeMaker constants section: +AR_STATIC_ARGS = cr +DIRFILESEP = / +DFSEP = $(DIRFILESEP) +NAME = PascalAs::PascalAs +NAME_SYM = PascalAs_PascalAs +VERSION = 1.06 +VERSION_MACRO = VERSION +VERSION_SYM = 1_06 +DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\" +XS_VERSION = 1.06 +XS_VERSION_MACRO = XS_VERSION +XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\" +INST_ARCHLIB = blib/arch +INST_SCRIPT = blib/script +INST_BIN = blib/bin +INST_LIB = blib/lib +INST_MAN1DIR = blib/man1 +INST_MAN3DIR = blib/man3 +MAN1EXT = 1p +MAN3EXT = 3pm +INSTALLDIRS = site +DESTDIR = +PREFIX = $(SITEPREFIX) +PERLPREFIX = /usr +SITEPREFIX = /usr/local +VENDORPREFIX = /usr +INSTALLPRIVLIB = /usr/share/perl/5.22 +DESTINSTALLPRIVLIB = 
$(DESTDIR)$(INSTALLPRIVLIB) +INSTALLSITELIB = /usr/local/share/perl/5.22.1 +DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB) +INSTALLVENDORLIB = /usr/share/perl5 +DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB) +INSTALLARCHLIB = /usr/lib/x86_64-linux-gnu/perl/5.22 +DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB) +INSTALLSITEARCH = /usr/local/lib/x86_64-linux-gnu/perl/5.22.1 +DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH) +INSTALLVENDORARCH = /usr/lib/x86_64-linux-gnu/perl5/5.22 +DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH) +INSTALLBIN = /usr/bin +DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN) +INSTALLSITEBIN = /usr/local/bin +DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN) +INSTALLVENDORBIN = /usr/bin +DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN) +INSTALLSCRIPT = /usr/bin +DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT) +INSTALLSITESCRIPT = /usr/local/bin +DESTINSTALLSITESCRIPT = $(DESTDIR)$(INSTALLSITESCRIPT) +INSTALLVENDORSCRIPT = /usr/bin +DESTINSTALLVENDORSCRIPT = $(DESTDIR)$(INSTALLVENDORSCRIPT) +INSTALLMAN1DIR = /usr/share/man/man1 +DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR) +INSTALLSITEMAN1DIR = /usr/local/man/man1 +DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR) +INSTALLVENDORMAN1DIR = /usr/share/man/man1 +DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR) +INSTALLMAN3DIR = /usr/share/man/man3 +DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR) +INSTALLSITEMAN3DIR = /usr/local/man/man3 +DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR) +INSTALLVENDORMAN3DIR = /usr/share/man/man3 +DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR) +PERL_LIB = /usr/share/perl/5.22 +PERL_ARCHLIB = /usr/lib/x86_64-linux-gnu/perl/5.22 +PERL_ARCHLIBDEP = /usr/lib/x86_64-linux-gnu/perl/5.22 +LIBPERL_A = libperl.a +FIRST_MAKEFILE = Makefile +MAKEFILE_OLD = Makefile.old +MAKE_APERL_FILE = Makefile.aperl +PERLMAINCC = $(CC) +PERL_INC = /usr/lib/x86_64-linux-gnu/perl/5.22/CORE +PERL_INCDEP = 
/usr/lib/x86_64-linux-gnu/perl/5.22/CORE +PERL = "/usr/bin/perl" +FULLPERL = "/usr/bin/perl" +ABSPERL = $(PERL) +PERLRUN = $(PERL) +FULLPERLRUN = $(FULLPERL) +ABSPERLRUN = $(ABSPERL) +PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +PERL_CORE = 0 +PERM_DIR = 755 +PERM_RW = 644 +PERM_RWX = 755 + +MAKEMAKER = /usr/share/perl/5.22/ExtUtils/MakeMaker.pm +MM_VERSION = 7.0401 +MM_REVISION = 70401 + +# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle). +# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle) +# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar) +# DLBASE = Basename part of dynamic library. May be just equal BASEEXT. +MAKE = make +FULLEXT = PascalAs/PascalAs +BASEEXT = PascalAs +PARENT_NAME = PascalAs +DLBASE = $(BASEEXT) +VERSION_FROM = lib/PascalAs/PascalAs.pm +OBJECT = +LDFROM = $(OBJECT) +LINKTYPE = dynamic +BOOTDEP = + +# Handy lists of source code files: +XS_FILES = +C_FILES = +O_FILES = +H_FILES = +MAN1PODS = +MAN3PODS = lib/PascalAs/PascalAs.pm + +# Where is the Config information that we are using/depend on +CONFIGDEP = $(PERL_ARCHLIBDEP)$(DFSEP)Config.pm $(PERL_INCDEP)$(DFSEP)config.h + +# Where to build things +INST_LIBDIR = $(INST_LIB)/PascalAs +INST_ARCHLIBDIR = $(INST_ARCHLIB)/PascalAs + +INST_AUTODIR = $(INST_LIB)/auto/$(FULLEXT) +INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT) + +INST_STATIC = +INST_DYNAMIC = +INST_BOOT = + +# Extra linker info +EXPORT_LIST = +PERL_ARCHIVE = +PERL_ARCHIVEDEP = +PERL_ARCHIVE_AFTER = + + +TO_INST_PM = lib/PascalAs/Cubin.pm \ + lib/PascalAs/PascalAs.pm \ + lib/PascalAs/PascalAsGrammar.pm + +PM_TO_BLIB = lib/PascalAs/Cubin.pm \ + blib/lib/PascalAs/Cubin.pm \ + lib/PascalAs/PascalAs.pm \ + blib/lib/PascalAs/PascalAs.pm \ + lib/PascalAs/PascalAsGrammar.pm \ + blib/lib/PascalAs/PascalAsGrammar.pm + + +# --- 
MakeMaker platform_constants section: +MM_Unix_VERSION = 7.0401 +PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc + + +# --- MakeMaker tool_autosplit section: +# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto +AUTOSPLITFILE = $(ABSPERLRUN) -e 'use AutoSplit; autosplit($$$$ARGV[0], $$$$ARGV[1], 0, 1, 1)' -- + + + +# --- MakeMaker tool_xsubpp section: + + +# --- MakeMaker tools_other section: +SHELL = /bin/sh +CHMOD = chmod +CP = cp +MV = mv +NOOP = $(TRUE) +NOECHO = @ +RM_F = rm -f +RM_RF = rm -rf +TEST_F = test -f +TOUCH = touch +UMASK_NULL = umask 0 +DEV_NULL = > /dev/null 2>&1 +MKPATH = $(ABSPERLRUN) -MExtUtils::Command -e 'mkpath' -- +EQUALIZE_TIMESTAMP = $(ABSPERLRUN) -MExtUtils::Command -e 'eqtime' -- +FALSE = false +TRUE = true +ECHO = echo +ECHO_N = echo -n +UNINST = 0 +VERBINST = 0 +MOD_INSTALL = $(ABSPERLRUN) -MExtUtils::Install -e 'install([ from_to => {@ARGV}, verbose => '\''$(VERBINST)'\'', uninstall_shadows => '\''$(UNINST)'\'', dir_mode => '\''$(PERM_DIR)'\'' ]);' -- +DOC_INSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'perllocal_install' -- +UNINSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'uninstall' -- +WARN_IF_OLD_PACKLIST = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'warn_if_old_packlist' -- +MACROSTART = +MACROEND = +USEMAKEFILE = -f +FIXIN = $(ABSPERLRUN) -MExtUtils::MY -e 'MY->fixin(shift)' -- +CP_NONEMPTY = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'cp_nonempty' -- + + +# --- MakeMaker makemakerdflt section: +makemakerdflt : all + $(NOECHO) $(NOOP) + + +# --- MakeMaker dist section: +TAR = tar +TARFLAGS = cvf +ZIP = zip +ZIPFLAGS = -r +COMPRESS = gzip --best +SUFFIX = .gz +SHAR = shar +PREOP = $(NOECHO) $(NOOP) +POSTOP = $(NOECHO) $(NOOP) +TO_UNIX = $(NOECHO) $(NOOP) +CI = ci -u +RCS_LABEL = rcs -Nv$(VERSION_SYM): -q +DIST_CP = best +DIST_DEFAULT = tardist +DISTNAME = PascalAs-PascalAs +DISTVNAME = PascalAs-PascalAs-1.06 + + +# --- MakeMaker macro section: + 
+ +# --- MakeMaker depend section: + + +# --- MakeMaker cflags section: + + +# --- MakeMaker const_loadlibs section: + + +# --- MakeMaker const_cccmd section: + + +# --- MakeMaker post_constants section: + + +# --- MakeMaker pasthru section: + +PASTHRU = LIBPERL_A="$(LIBPERL_A)"\ + LINKTYPE="$(LINKTYPE)"\ + LD="$(LD)"\ + PREFIX="$(PREFIX)" + + +# --- MakeMaker special_targets section: +.SUFFIXES : .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT) + +.PHONY: all config static dynamic test linkext manifest blibdirs clean realclean disttest distdir + + + +# --- MakeMaker c_o section: + + +# --- MakeMaker xs_c section: + + +# --- MakeMaker xs_o section: + + +# --- MakeMaker top_targets section: +all :: pure_all manifypods + $(NOECHO) $(NOOP) + + +pure_all :: config pm_to_blib subdirs linkext + $(NOECHO) $(NOOP) + +subdirs :: $(MYEXTLIB) + $(NOECHO) $(NOOP) + +config :: $(FIRST_MAKEFILE) blibdirs + $(NOECHO) $(NOOP) + +help : + perldoc ExtUtils::MakeMaker + + +# --- MakeMaker blibdirs section: +blibdirs : $(INST_LIBDIR)$(DFSEP).exists $(INST_ARCHLIB)$(DFSEP).exists $(INST_AUTODIR)$(DFSEP).exists $(INST_ARCHAUTODIR)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists $(INST_SCRIPT)$(DFSEP).exists $(INST_MAN1DIR)$(DFSEP).exists $(INST_MAN3DIR)$(DFSEP).exists + $(NOECHO) $(NOOP) + +# Backwards compat with 6.18 through 6.25 +blibdirs.ts : blibdirs + $(NOECHO) $(NOOP) + +$(INST_LIBDIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_LIBDIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_LIBDIR) + $(NOECHO) $(TOUCH) $(INST_LIBDIR)$(DFSEP).exists + +$(INST_ARCHLIB)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_ARCHLIB) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHLIB) + $(NOECHO) $(TOUCH) $(INST_ARCHLIB)$(DFSEP).exists + +$(INST_AUTODIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_AUTODIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_AUTODIR) + $(NOECHO) $(TOUCH) $(INST_AUTODIR)$(DFSEP).exists + +$(INST_ARCHAUTODIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) 
$(INST_ARCHAUTODIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHAUTODIR) + $(NOECHO) $(TOUCH) $(INST_ARCHAUTODIR)$(DFSEP).exists + +$(INST_BIN)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_BIN) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_BIN) + $(NOECHO) $(TOUCH) $(INST_BIN)$(DFSEP).exists + +$(INST_SCRIPT)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_SCRIPT) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_SCRIPT) + $(NOECHO) $(TOUCH) $(INST_SCRIPT)$(DFSEP).exists + +$(INST_MAN1DIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_MAN1DIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN1DIR) + $(NOECHO) $(TOUCH) $(INST_MAN1DIR)$(DFSEP).exists + +$(INST_MAN3DIR)$(DFSEP).exists :: Makefile.PL + $(NOECHO) $(MKPATH) $(INST_MAN3DIR) + $(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN3DIR) + $(NOECHO) $(TOUCH) $(INST_MAN3DIR)$(DFSEP).exists + + + +# --- MakeMaker linkext section: + +linkext :: $(LINKTYPE) + $(NOECHO) $(NOOP) + + +# --- MakeMaker dlsyms section: + + +# --- MakeMaker dynamic_bs section: + +BOOTSTRAP = + + +# --- MakeMaker dynamic section: + +dynamic :: $(FIRST_MAKEFILE) $(BOOTSTRAP) $(INST_DYNAMIC) + $(NOECHO) $(NOOP) + + +# --- MakeMaker dynamic_lib section: + + +# --- MakeMaker static section: + +## $(INST_PM) has been moved to the all: target. 
+## It remains here for awhile to allow for old usage: "make static" +static :: $(FIRST_MAKEFILE) $(INST_STATIC) + $(NOECHO) $(NOOP) + + +# --- MakeMaker static_lib section: + + +# --- MakeMaker manifypods section: + +POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--" +POD2MAN = $(POD2MAN_EXE) + + +manifypods : pure_all \ + lib/PascalAs/PascalAs.pm + $(NOECHO) $(POD2MAN) --section=$(MAN3EXT) --perm_rw=$(PERM_RW) -u \ + lib/PascalAs/PascalAs.pm $(INST_MAN3DIR)/PascalAs::PascalAs.$(MAN3EXT) + + + + +# --- MakeMaker processPL section: + + +# --- MakeMaker installbin section: + +EXE_FILES = bin/pascalas.pl + +pure_all :: $(INST_SCRIPT)/pascalas.pl + $(NOECHO) $(NOOP) + +realclean :: + $(RM_F) \ + $(INST_SCRIPT)/pascalas.pl + +$(INST_SCRIPT)/pascalas.pl : bin/pascalas.pl $(FIRST_MAKEFILE) $(INST_SCRIPT)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists + $(NOECHO) $(RM_F) $(INST_SCRIPT)/pascalas.pl + $(CP) bin/pascalas.pl $(INST_SCRIPT)/pascalas.pl + $(FIXIN) $(INST_SCRIPT)/pascalas.pl + -$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_SCRIPT)/pascalas.pl + + + +# --- MakeMaker subdirs section: + +# none + +# --- MakeMaker clean_subdirs section: +clean_subdirs : + $(NOECHO) $(NOOP) + + +# --- MakeMaker clean section: + +# Delete temporary files but do not touch installed files. We don't delete +# the Makefile here so a later make realclean still has a makefile to use. + +clean :: clean_subdirs + - $(RM_F) \ + $(BASEEXT).bso $(BASEEXT).def \ + $(BASEEXT).exp $(BASEEXT).x \ + $(BOOTSTRAP) $(INST_ARCHAUTODIR)/extralibs.all \ + $(INST_ARCHAUTODIR)/extralibs.ld $(MAKE_APERL_FILE) \ + *$(LIB_EXT) *$(OBJ_EXT) \ + *perl.core MYMETA.json \ + MYMETA.yml blibdirs.ts \ + core core.*perl.*.? 
\ + core.[0-9] core.[0-9][0-9] \ + core.[0-9][0-9][0-9] core.[0-9][0-9][0-9][0-9] \ + core.[0-9][0-9][0-9][0-9][0-9] lib$(BASEEXT).def \ + mon.out perl \ + perl$(EXE_EXT) perl.exe \ + perlmain.c pm_to_blib \ + pm_to_blib.ts so_locations \ + tmon.out + - $(RM_RF) \ + blib + $(NOECHO) $(RM_F) $(MAKEFILE_OLD) + - $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL) + + +# --- MakeMaker realclean_subdirs section: +realclean_subdirs : + $(NOECHO) $(NOOP) + + +# --- MakeMaker realclean section: +# Delete temporary files (via clean) and also delete dist files +realclean purge :: clean realclean_subdirs + - $(RM_F) \ + $(FIRST_MAKEFILE) $(MAKEFILE_OLD) + - $(RM_RF) \ + $(DISTVNAME) + + +# --- MakeMaker metafile section: +metafile : create_distdir + $(NOECHO) $(ECHO) Generating META.yml + $(NOECHO) $(ECHO) '---' > META_new.yml + $(NOECHO) $(ECHO) 'abstract: '\''Assembler for NVIDIA Maxwell architecture'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'author:' >> META_new.yml + $(NOECHO) $(ECHO) ' - '\''Scott Gray '\''' >> META_new.yml + $(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: '\''0'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: '\''0'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'dynamic_config: 1' >> META_new.yml + $(NOECHO) $(ECHO) 'generated_by: '\''ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'license: mit' >> META_new.yml + $(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml + $(NOECHO) $(ECHO) ' url: http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml + $(NOECHO) $(ECHO) ' version: '\''1.4'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'name: PascalAs-PascalAs' >> META_new.yml + $(NOECHO) $(ECHO) 'no_index:' >> META_new.yml + $(NOECHO) $(ECHO) ' directory:' >> META_new.yml + $(NOECHO) $(ECHO) ' - t' >> META_new.yml + $(NOECHO) $(ECHO) ' - inc' >> META_new.yml + $(NOECHO) 
$(ECHO) 'requires:' >> META_new.yml + $(NOECHO) $(ECHO) ' Carp: '\''1.29'\''' >> META_new.yml + $(NOECHO) $(ECHO) ' Data::Dumper: '\''2.145'\''' >> META_new.yml + $(NOECHO) $(ECHO) 'version: '\''1.06'\''' >> META_new.yml + -$(NOECHO) $(MV) META_new.yml $(DISTVNAME)/META.yml + $(NOECHO) $(ECHO) Generating META.json + $(NOECHO) $(ECHO) '{' > META_new.json + $(NOECHO) $(ECHO) ' "abstract" : "Assembler for NVIDIA Maxwell architecture",' >> META_new.json + $(NOECHO) $(ECHO) ' "author" : [' >> META_new.json + $(NOECHO) $(ECHO) ' "Scott Gray "' >> META_new.json + $(NOECHO) $(ECHO) ' ],' >> META_new.json + $(NOECHO) $(ECHO) ' "dynamic_config" : 1,' >> META_new.json + $(NOECHO) $(ECHO) ' "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001",' >> META_new.json + $(NOECHO) $(ECHO) ' "license" : [' >> META_new.json + $(NOECHO) $(ECHO) ' "mit"' >> META_new.json + $(NOECHO) $(ECHO) ' ],' >> META_new.json + $(NOECHO) $(ECHO) ' "meta-spec" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",' >> META_new.json + $(NOECHO) $(ECHO) ' "version" : "2"' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "name" : "PascalAs-PascalAs",' >> META_new.json + $(NOECHO) $(ECHO) ' "no_index" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "directory" : [' >> META_new.json + $(NOECHO) $(ECHO) ' "t",' >> META_new.json + $(NOECHO) $(ECHO) ' "inc"' >> META_new.json + $(NOECHO) $(ECHO) ' ]' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "prereqs" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "build" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "requires" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "ExtUtils::MakeMaker" : "0"' >> META_new.json + $(NOECHO) $(ECHO) ' }' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "configure" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "requires" : {' >> META_new.json + $(NOECHO) $(ECHO) 
' "ExtUtils::MakeMaker" : "0"' >> META_new.json + $(NOECHO) $(ECHO) ' }' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "runtime" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "requires" : {' >> META_new.json + $(NOECHO) $(ECHO) ' "Carp" : "1.29",' >> META_new.json + $(NOECHO) $(ECHO) ' "Data::Dumper" : "2.145"' >> META_new.json + $(NOECHO) $(ECHO) ' }' >> META_new.json + $(NOECHO) $(ECHO) ' }' >> META_new.json + $(NOECHO) $(ECHO) ' },' >> META_new.json + $(NOECHO) $(ECHO) ' "release_status" : "stable",' >> META_new.json + $(NOECHO) $(ECHO) ' "version" : "1.06"' >> META_new.json + $(NOECHO) $(ECHO) '}' >> META_new.json + -$(NOECHO) $(MV) META_new.json $(DISTVNAME)/META.json + + +# --- MakeMaker signature section: +signature : + cpansign -s + + +# --- MakeMaker dist_basics section: +distclean :: realclean distcheck + $(NOECHO) $(NOOP) + +distcheck : + $(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck + +skipcheck : + $(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck + +manifest : + $(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest + +veryclean : realclean + $(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old + + + +# --- MakeMaker dist_core section: + +dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE) + $(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \ + -e ' if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' -- + +tardist : $(DISTVNAME).tar$(SUFFIX) + $(NOECHO) $(NOOP) + +uutardist : $(DISTVNAME).tar$(SUFFIX) + uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu + $(NOECHO) $(ECHO) 'Created $(DISTVNAME).tar$(SUFFIX)_uu' + +$(DISTVNAME).tar$(SUFFIX) : distdir + $(PREOP) + $(TO_UNIX) + $(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME) + $(RM_RF) $(DISTVNAME) + $(COMPRESS) $(DISTVNAME).tar + $(NOECHO) $(ECHO) 'Created $(DISTVNAME).tar$(SUFFIX)' + $(POSTOP) + +zipdist : 
$(DISTVNAME).zip + $(NOECHO) $(NOOP) + +$(DISTVNAME).zip : distdir + $(PREOP) + $(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME) + $(RM_RF) $(DISTVNAME) + $(NOECHO) $(ECHO) 'Created $(DISTVNAME).zip' + $(POSTOP) + +shdist : distdir + $(PREOP) + $(SHAR) $(DISTVNAME) > $(DISTVNAME).shar + $(RM_RF) $(DISTVNAME) + $(NOECHO) $(ECHO) 'Created $(DISTVNAME).shar' + $(POSTOP) + + +# --- MakeMaker distdir section: +create_distdir : + $(RM_RF) $(DISTVNAME) + $(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \ + -e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');" + +distdir : create_distdir distmeta + $(NOECHO) $(NOOP) + + + +# --- MakeMaker dist_test section: +disttest : distdir + cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL + cd $(DISTVNAME) && $(MAKE) $(PASTHRU) + cd $(DISTVNAME) && $(MAKE) test $(PASTHRU) + + + +# --- MakeMaker dist_ci section: + +ci : + $(PERLRUN) "-MExtUtils::Manifest=maniread" \ + -e "@all = keys %{ maniread() };" \ + -e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \ + -e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});" + + +# --- MakeMaker distmeta section: +distmeta : create_distdir metafile + $(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'exit unless -e q{META.yml};' \ + -e 'eval { maniadd({q{META.yml} => q{Module YAML meta-data (added by MakeMaker)}}) }' \ + -e ' or print "Could not add META.yml to MANIFEST: $$$${'\''@'\''}\n"' -- + $(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'exit unless -f q{META.json};' \ + -e 'eval { maniadd({q{META.json} => q{Module JSON meta-data (added by MakeMaker)}}) }' \ + -e ' or print "Could not add META.json to MANIFEST: $$$${'\''@'\''}\n"' -- + + + +# --- MakeMaker distsignature section: +distsignature : create_distdir + $(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) }' \ + -e ' or print "Could not add 
SIGNATURE to MANIFEST: $$$${'\''@'\''}\n"' -- + $(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE + cd $(DISTVNAME) && cpansign -s + + + +# --- MakeMaker install section: + +install :: pure_install doc_install + $(NOECHO) $(NOOP) + +install_perl :: pure_perl_install doc_perl_install + $(NOECHO) $(NOOP) + +install_site :: pure_site_install doc_site_install + $(NOECHO) $(NOOP) + +install_vendor :: pure_vendor_install doc_vendor_install + $(NOECHO) $(NOOP) + +pure_install :: pure_$(INSTALLDIRS)_install + $(NOECHO) $(NOOP) + +doc_install :: doc_$(INSTALLDIRS)_install + $(NOECHO) $(NOOP) + +pure__install : pure_site_install + $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site + +doc__install : doc_site_install + $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site + +pure_perl_install :: all + $(NOECHO) umask 022; $(MOD_INSTALL) \ + "$(INST_LIB)" "$(DESTINSTALLPRIVLIB)" \ + "$(INST_ARCHLIB)" "$(DESTINSTALLARCHLIB)" \ + "$(INST_BIN)" "$(DESTINSTALLBIN)" \ + "$(INST_SCRIPT)" "$(DESTINSTALLSCRIPT)" \ + "$(INST_MAN1DIR)" "$(DESTINSTALLMAN1DIR)" \ + "$(INST_MAN3DIR)" "$(DESTINSTALLMAN3DIR)" + $(NOECHO) $(WARN_IF_OLD_PACKLIST) \ + "$(SITEARCHEXP)/auto/$(FULLEXT)" + + +pure_site_install :: all + $(NOECHO) umask 02; $(MOD_INSTALL) \ + read "$(SITEARCHEXP)/auto/$(FULLEXT)/.packlist" \ + write "$(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist" \ + "$(INST_LIB)" "$(DESTINSTALLSITELIB)" \ + "$(INST_ARCHLIB)" "$(DESTINSTALLSITEARCH)" \ + "$(INST_BIN)" "$(DESTINSTALLSITEBIN)" \ + "$(INST_SCRIPT)" "$(DESTINSTALLSITESCRIPT)" \ + "$(INST_MAN1DIR)" "$(DESTINSTALLSITEMAN1DIR)" \ + "$(INST_MAN3DIR)" "$(DESTINSTALLSITEMAN3DIR)" + $(NOECHO) $(WARN_IF_OLD_PACKLIST) \ + "$(PERL_ARCHLIB)/auto/$(FULLEXT)" + +pure_vendor_install :: all + $(NOECHO) umask 022; $(MOD_INSTALL) \ + "$(INST_LIB)" "$(DESTINSTALLVENDORLIB)" \ + "$(INST_ARCHLIB)" "$(DESTINSTALLVENDORARCH)" \ + "$(INST_BIN)" "$(DESTINSTALLVENDORBIN)" \ + "$(INST_SCRIPT)" 
"$(DESTINSTALLVENDORSCRIPT)" \ + "$(INST_MAN1DIR)" "$(DESTINSTALLVENDORMAN1DIR)" \ + "$(INST_MAN3DIR)" "$(DESTINSTALLVENDORMAN3DIR)" + + +doc_perl_install :: all + +doc_site_install :: all + $(NOECHO) $(ECHO) Appending installation info to "$(DESTINSTALLSITEARCH)/perllocal.pod" + -$(NOECHO) umask 02; $(MKPATH) "$(DESTINSTALLSITEARCH)" + -$(NOECHO) umask 02; $(DOC_INSTALL) \ + "Module" "$(NAME)" \ + "installed into" $(INSTALLSITELIB) \ + LINKTYPE "$(LINKTYPE)" \ + VERSION "$(VERSION)" \ + EXE_FILES "$(EXE_FILES)" \ + >> "$(DESTINSTALLSITEARCH)/perllocal.pod" + +doc_vendor_install :: all + + +uninstall :: uninstall_from_$(INSTALLDIRS)dirs + $(NOECHO) $(NOOP) + +uninstall_from_perldirs :: + +uninstall_from_sitedirs :: + $(NOECHO) $(UNINSTALL) "$(SITEARCHEXP)/auto/$(FULLEXT)/.packlist" + +uninstall_from_vendordirs :: + + +# --- MakeMaker force section: +# Phony target to force checking subdirectories. +FORCE : + $(NOECHO) $(NOOP) + + +# --- MakeMaker perldepend section: + + +# --- MakeMaker makefile section: +# We take a very conservative approach here, but it's worth it. +# We move Makefile to Makefile.old here to avoid gnu make looping. +$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP) + $(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?" + $(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..." + -$(NOECHO) $(RM_F) $(MAKEFILE_OLD) + -$(NOECHO) $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) + - $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL) + $(PERLRUN) Makefile.PL + $(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <==" + $(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command. 
<==" + $(FALSE) + + + +# --- MakeMaker staticmake section: + +# --- MakeMaker makeaperl section --- +MAP_TARGET = perl +FULLPERL = "/usr/bin/perl" + +$(MAP_TARGET) :: static $(MAKE_APERL_FILE) + $(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@ + +$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib + $(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET) + $(NOECHO) $(PERLRUNINST) \ + Makefile.PL DIR="" \ + MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \ + MAKEAPERL=1 NORECURS=1 CCCDLFLAGS= + + +# --- MakeMaker test section: + +TEST_VERBOSE=0 +TEST_TYPE=test_$(LINKTYPE) +TEST_FILE = test.pl +TEST_FILES = t/*.t +TESTDB_SW = -d + +testdb :: testdb_$(LINKTYPE) + +test :: $(TEST_TYPE) subdirs-test + +subdirs-test :: + $(NOECHO) $(NOOP) + + +test_dynamic :: pure_all + PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-MExtUtils::Command::MM" "-MTest::Harness" "-e" "undef *Test::Harness::Switches; test_harness($(TEST_VERBOSE), '$(INST_LIB)', '$(INST_ARCHLIB)')" $(TEST_FILES) + +testdb_dynamic :: pure_all + PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE) + +test_ : test_dynamic + +test_static :: test_dynamic +testdb_static :: testdb_dynamic + + +# --- MakeMaker ppd section: +# Creates a PPD (Perl Package Description) for a binary distribution. 
+ppd : + $(NOECHO) $(ECHO) '' > $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' Assembler for NVIDIA Maxwell architecture' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' Scott Gray <sgray@nervanasys.com>' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) '' >> $(DISTNAME).ppd + + +# --- MakeMaker pm_to_blib section: + +pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM) + $(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \ + lib/PascalAs/Cubin.pm blib/lib/PascalAs/Cubin.pm \ + lib/PascalAs/PascalAs.pm blib/lib/PascalAs/PascalAs.pm \ + lib/PascalAs/PascalAsGrammar.pm blib/lib/PascalAs/PascalAsGrammar.pm + $(NOECHO) $(TOUCH) pm_to_blib + + +# --- MakeMaker selfdocument section: + + +# --- MakeMaker postamble section: + + +# End. diff --git a/Assembler/PascalAs/Makefile.PL b/Assembler/PascalAs/Makefile.PL new file mode 100644 index 0000000..6acdeda --- /dev/null +++ b/Assembler/PascalAs/Makefile.PL @@ -0,0 +1,14 @@ +require 5.10.0; +use ExtUtils::MakeMaker; +# See lib/ExtUtils/MakeMaker.pm for details of how to influence +# the contents of the Makefile that is written. +WriteMakefile( + NAME => 'PascalAs::PascalAs', + VERSION_FROM => 'lib/PascalAs/PascalAs.pm', # finds $VERSION + EXE_FILES => ['bin/pascalas.pl'], + PREREQ_PM => {Carp => 1.29, Data::Dumper => 2.145}, + LICENSE => 'MIT', + ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 + (ABSTRACT_FROM => 'lib/PascalAs/PascalAs.pm', # retrieve abstract from module + AUTHOR => 'Scott Gray ') : ()), +); diff --git a/Assembler/PascalAs/README.md b/Assembler/PascalAs/README.md new file mode 100644 index 0000000..318aba8 --- /dev/null +++ b/Assembler/PascalAs/README.md @@ -0,0 +1,28 @@ +# MaxAs +Assembler for NVIDIA Maxwell architecture + +To install (system-wide): + + sudo cpanm git://github.com/NervanaSystems/maxas.git + +or + + perl Makefile.PL + make + sudo make install + + +See wiki pages for more information: + +- [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction) +- [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started) +- [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes) +- [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM) + +Related work with lots of additional shader assembly (sass) examples: + +- [NervanaGPU](https://github.com/NervanaSystems/nervanagpu) + +This project is released under the [MIT License](http://opensource.org/licenses/MIT). 
#!/usr/bin/perl
# pascalas.pl - command line front end for the PascalAs assembler.
#
# The first command line argument selects the mode; the remaining arguments
# are consumed, in order, by the selected branch below:
#   --list    print the kernels and global symbols found in a cubin
#   --test    run PascalAs::PascalAs::Test over a cubin or a sass dump
#   --extract dump one kernel of a cubin through PascalAs::PascalAs::Extract
#   --sass    run Extract over an existing sass dump
#   --insert  assemble an asm file and write the result into a cubin
#   --pre     run only PascalAs::PascalAs::Preprocess
#   --version print $PascalAs::PascalAs::VERSION
use strict;
use PascalAs::Cubin;
use PascalAs::PascalAs;
use Data::Dumper;
use File::Spec;

require 5.10.0;

$Data::Dumper::Sortkeys = 1;

my $mode = shift;

# List cubin contents
if ($mode =~ /^\-?\-l/i)
{
    my $cubinFile = shift or usage();

    my $cubin = PascalAs::Cubin->new($cubinFile);

    my $arch    = $cubin->arch;
    my $class   = $cubin->class;
    my $asize   = $cubin->address_size;
    my $kernels = $cubin->listKernels;
    my $symbols = $cubin->listSymbols;

    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize;

    foreach my $ker (sort keys %$kernels)
    {
        printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)};
    }
    foreach my $sym (sort keys %$symbols)
    {
        printf "Symbol: %s\n", $sym;
    }
}
# Test that the assembler can reproduce the op codes this cubin or sass contains
elsif ($mode =~ /^\-?\-t/i)
{
    # Optional flags are positional: --reg first, then --all.
    # (Written as a ternary: "my $x = shift if COND" has unspecified behaviour.)
    my $reg  = $ARGV[0] =~ /^\-?\-r/i ? shift : undef;
    my $all  = $ARGV[0] =~ /^\-?\-a/i ? shift : undef;
    my $file = shift or usage();
    my $fh;
    # sass file
    if (-T $file)
    {
        open $fh, '<', $file or die "$file: $!";
    }
    # cubin file
    else
    {
        my $cubin = PascalAs::Cubin->new($file);
        my $arch  = $cubin->arch;

        $fh = open_sass_pipe("cuobjdump -arch sm_$arch -sass $file");
    }
    exit(PascalAs::PascalAs::Test($fh, $reg, $all) ? 1 : 0);
}
# Extract an asm file containing the desired kernel
elsif ($mode =~ /^\-?\-e/i)
{
    my $kernelName;
    if ($ARGV[0] =~ /^\-?\-k/i)
    {
        shift;
        $kernelName = shift or usage();
    }
    my $cubinFile = shift or usage();
    my $asmFile   = shift;
    my $cubin     = PascalAs::Cubin->new($cubinFile);
    my $arch      = $cubin->arch;
    my $kernels   = $cubin->listKernels;

    #default the kernel name if not specified.
    $kernelName ||= (sort keys %$kernels)[0];

    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";

    # The failure message now reports the command that was really run
    # (it used to claim "-arch sm_60" regardless of the cubin's arch).
    my $in  = open_sass_pipe("cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile");
    my $out = open_output($asmFile);

    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";

    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));

    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";

    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};

    print $out "#\n# Instructions:\n\n";

    PascalAs::PascalAs::Extract($in, $out, $kernel->{Params});

    close $out if $asmFile;
    close $in;
}
# Extract a kernel from a sass dump
elsif ($mode =~ /^\-?\-s/i)
{
    my $sassFile = shift or usage();
    my $asmFile  = shift;

    open my $in, '<', $sassFile or die "$sassFile: $!";

    my $out = open_output($asmFile);

    PascalAs::PascalAs::Extract($in, $out, []);

    close $out if $asmFile;
    close $in;
}
# Insert the kernel asm back into the cubin:
elsif ($mode =~ /^\-?\-i/i)
{
    # Optional arguments are positional: -w, -k <name>, -n, then any -D pairs.
    my $nowarn;
    if ($ARGV[0] =~ /^\-?\-w/i)
    {
        $nowarn = shift;
    }
    my $kernelName;
    if ($ARGV[0] =~ /^\-?\-k/i)
    {
        shift;
        $kernelName = shift or usage();
    }
    my $noReuse = $ARGV[0] =~ /^\-?\-n/i ? shift : undef;
    apply_defines();

    my $asmFile   = shift or usage();
    my $cubinFile = shift or usage();
    my $newCubin  = shift || $cubinFile;

    my $file = slurp_file($asmFile);

    my ($vol,$dir) = File::Spec->splitpath($asmFile);
    my $include = [$vol, $dir];

    # extract the kernel name from the file
    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
    die "asm file missing kernel name or is badly formatted" unless $kernelName;

    my $kernel = PascalAs::PascalAs::Assemble($file, $include, !$noReuse, $nowarn);

    my $cubin = PascalAs::Cubin->new($cubinFile);
    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";

    $cubin->modifyKernel(%$kernel);

    $cubin->write($newCubin);

    # "%%" is the literal percent sign; the kernel name is passed as data so
    # that it can never be interpreted as part of the format.
    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
}
# Preprocessing:
elsif ($mode =~ /^\-?\-p/i)
{
    apply_defines();
    my $debug    = $ARGV[0] =~ /^\-?\-d/i ? shift : undef;
    my $asmFile  = shift or usage();
    my $asmFile2 = shift;

    die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2;

    # NOTE(review): the slurp used to leave $/ undefined for the rest of this
    # branch, including the Preprocess call; it is now restored right after
    # the read, which matches what the --insert branch always did.
    my $file = slurp_file($asmFile);

    my ($vol,$dir) = File::Spec->splitpath($asmFile);
    my $include = [$vol, $dir];

    my $out = open_output($asmFile2);
    print $out PascalAs::PascalAs::Preprocess($file, $include, $debug);
    close $out;
}
# get version information
elsif ($mode =~ /^\-?\-v/i)
{
    print "$PascalAs::PascalAs::VERSION\n";
}
else
{
    print "$mode\n";
    usage();
}

exit(0);


# Consume leading "-D<name> <value>" pairs from @ARGV and store each value in
# the package variable $PascalAs::PascalAs::CODE::<name>.
# The variable is assigned through the symbol table instead of a string eval,
# so a value containing quotes or other Perl syntax is stored verbatim.
sub apply_defines
{
    while (@ARGV && $ARGV[0] =~ /^\-?\-D(\w+)/)
    {
        my $name = $1;
        shift @ARGV;
        my $value = shift @ARGV;
        $value = '' unless defined $value;

        no strict 'refs';
        ${"PascalAs::PascalAs::CODE::$name"} = $value;
    }
    return;
}

# Return a write handle for $path, or STDOUT when no path was given.
# Dies with "<path>: <error>" when the file cannot be created.
sub open_output
{
    my ($path) = @_;
    return \*STDOUT unless $path;

    open my $out, '>', $path or die "$path: $!";
    return $out;
}

# Run a cuobjdump command and return a handle on its output.
# The first output line is consumed; if it carries "cuobjdump fatal" it is
# printed and the program exits with status 1.
# NOTE(review): $cmd is a single string, so it still passes through the shell
# when it contains metacharacters - file and kernel names are not escaped.
sub open_sass_pipe
{
    my ($cmd) = @_;

    open my $fh, '-|', $cmd or die "$cmd: $!";
    my $first = <$fh>;
    if (defined $first && $first =~ /cuobjdump fatal/)
    {
        print $first;
        exit(1);
    }
    return $fh;
}

# Return the whole content of $path as one string; dies "<path>: <error>"
# when the file cannot be opened.
sub slurp_file
{
    my ($path) = @_;

    open my $fh, '<', $path or die "$path: $!";
    local $/;
    my $text = <$fh>;
    close $fh;
    return $text;
}

# Print the command line help and exit with status 1.
sub usage
{
    print <<EOF;
Usage:

  List kernels and symbols:

    pascalas.pl --list|-l <cubin_file>

  Test a cubin or sass file to to see if the assembler can reproduce all of the contained opcodes.
  Also useful for extending the missing grammar rules. Defaults to only showing failures without --all.
  With the --reg flag it will show register bank conflicts not hidden by reuse flags.

    pascalas.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>

  Extract a single kernel into an asm file from a cubin.
  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.

    pascalas.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]

  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
  Include the debug flag to print out detailed scheduler info.

    pascalas.pl --pre|-p [--debug|-d] <asm_file> [new_asm_file]

  Insert the kernel asm back into the cubin. Overwrite existing or create new cubin.
  Optionally you can skip register reuse flag auto insertion. This allows you to observe
  performance without any reuse or you can use it to set the flags manually in your sass.

    pascalas.pl --insert|-i [--noreuse|-n] <asm_file> <cubin_file> [new_cubin_file]

  Display version information and exit:

    pascalas.pl --version|-v

EOF
    exit(1);
}

__END__
package MaxAs::Cubin;

# Reader for cubin ELF files: new() loads the headers, sections and symbols
# and derives per-kernel meta data; the accessors below expose the results.

use strict;
use Data::Dumper;

# Each table alternates a pack/unpack template letter with the hash key the
# unpacked value is stored under.
my @Elf32_Hdr = qw(
    H8  magic
    C   fileClass
    C   encoding
    C   fileVersion
    H18 padding
    S   type
    S   machine
    L   version
    L   entry
    L   phOffset
    L   shOffset
    L   flags
    S   ehSize
    S   phEntSize
    S   phNum
    S   shEntSize
    S   shNum
    S   shStrIndx
);
my @Elf64_Hdr = qw(
    H8  magic
    C   fileClass
    C   encoding
    C   fileVersion
    H18 padding
    S   type
    S   machine
    L   version
    Q   entry
    Q   phOffset
    Q   shOffset
    L   flags
    S   ehSize
    S   phEntSize
    S   phNum
    S   shEntSize
    S   shNum
    S   shStrIndx
);
my @Elf32_PrgHdr = qw(
    L   type
    L   offset
    L   vaddr
    L   paddr
    L   fileSize
    L   memSize
    L   flags
    L   align
);
my @Elf64_PrgHdr = qw(
    L   type
    L   flags
    Q   offset
    Q   vaddr
    Q   paddr
    Q   fileSize
    Q   memSize
    Q   align
);
my @Elf32_SecHdr = qw(
    L   name
    L   type
    L   flags
    L   addr
    L   offset
    L   size
    L   link
    L   info
    L   align
    L   entSize
);
my @Elf64_SecHdr = qw(
    L   name
    L   type
    Q   flags
    Q   addr
    Q   offset
    Q   size
    L   link
    L   info
    Q   align
    Q   entSize
);
my @Elf32_SymEnt = qw(
    L   name
    L   value
    L   size
    C   info
    C   other
    S   shIndx
);
my @Elf64_SymEnt = qw(
    L   name
    C   info
    C   other
    S   shIndx
    Q   value
    Q   size
);
my @symBind = qw(LOCAL GLOBAL WEAK);

# Split the Elf Header defs into template strings (T) and corresponding hash keys columns (C).
# Index 1 holds the 32 bit layout, index 2 the 64 bit layout (the ELF fileClass value).
# Template letters are at most 3 characters long, every key name is longer.
my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);

$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr;
$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr;
$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr;
$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt;

$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr;
$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr;
$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr;
$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt;

$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr ];
$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ];
$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ];
$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ];

$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr ];
$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ];
$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ];
$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ];

# Load a cubin ELF file.
#
#   my $cubin = MaxAs::Cubin->new($file);
#
# Returns a blessed hash holding the ELF header (elfHdr), program headers
# (prgHdrs), section headers with their raw data as hex (secHdrs, also
# reachable by section name), the kernels (Kernels) and the other global
# symbols (Symbols).
#
# Dies when the file cannot be opened, is not a 32/64 bit ELF file, or its
# flags report an architecture below sm_50.
sub new
{
    my ($package, $file) = @_;

    my $cubin = bless { fileName => $file }, $package;

    # Three-argument open: the path is never interpreted as a mode or a pipe.
    open my $fh, '<', $file or die "$file: $!";
    binmode($fh);

    # Read in assuming 32 bit header (0x34 bytes)
    my $data;
    my $got = read $fh, $data, 0x34;
    die "$file: not an ELF file (header too short)\n" unless defined $got && $got == 0x34;

    my $elfHdr = $cubin->{elfHdr} = {};
    @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;

    # "\x7fELF" as unpacked by the H8 template
    die "$file: not an ELF file (bad magic)\n" unless $elfHdr->{magic} eq '7f454c46';

    # 1: 32bit, 2: 64bit
    my $class = $elfHdr->{fileClass};
    die "$file: unsupported ELF class $class\n" unless $class == 1 || $class == 2;

    # re-read in with 64 bit header if needed
    if ($class == 2)
    {
        # The 64 bit template consumes exactly 0x40 bytes.
        seek $fh, 0, 0;
        $got = read $fh, $data, 0x40;
        die "$file: not an ELF file (header too short)\n" unless defined $got && $got == 0x40;
        @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;

        $cubin->{Class} = 64;
    }
    else
    {
        $cubin->{Class} = 32;
    }

    # verify sm_50 cubin
    $cubin->{Arch} = $elfHdr->{flags} & 0xFF;
    die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50;

    $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;

    # Read in Program Headers
    seek $fh, $elfHdr->{phOffset}, 0;
    foreach (1 .. $elfHdr->{phNum})
    {
        read $fh, $data, $elfHdr->{phEntSize};

        my %prgHdr = (Indx => $_ - 1);
        @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
        push @{$cubin->{prgHdrs}}, \%prgHdr;
    }

    # Read in Section Headers
    seek $fh, $elfHdr->{shOffset}, 0;
    foreach (1 .. $elfHdr->{shNum})
    {
        read $fh, $data, $elfHdr->{shEntSize};

        my %secHdr = (Indx => $_ - 1);
        @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
        push @{$cubin->{secHdrs}}, \%secHdr;
    }

    # Read in Section data
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        $data = '';
        # Skip sections with no data (type NULL or NOBITS)
        if ($secHdr->{size} && $secHdr->{type} != 8)
        {
            seek $fh, $secHdr->{offset}, 0;
            read $fh, $data, $secHdr->{size};
        }
        # Convert string tables to maps (byte offset => string)
        if ($secHdr->{type} == 3) # STRTAB
        {
            my $strTab = $secHdr->{StrTab} = {};
            my $indx = 0;
            foreach my $str (split "\0", $data)
            {
                $strTab->{$indx} = $str;
                $indx += 1 + length($str);
            }
        }
        # Read in Symbol data
        if ($secHdr->{type} == 2) # SYMTAB
        {
            # A zero entry size would never advance $offset below.
            die "$file: symbol table with zero entry size\n"
                if $secHdr->{size} && !$secHdr->{entSize};

            my $offset = 0;
            while ($offset < $secHdr->{size})
            {
                my $symEnt = {};
                @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize});
                $offset += $secHdr->{entSize};

                push @{$secHdr->{SymTab}}, $symEnt;
            }
        }
        # Cache raw data for further processing and writing
        $secHdr->{Data} = unpack 'H*', $data;
    }
    close $fh;

    # Update section headers with their names. Map names directly to headers.
    my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
        $cubin->{$secHdr->{Name}} = $secHdr;
    }

    # Update symbols with their names
    # For the Global functions, extract kernel meta data
    # Populate the kernel hash
    my $strTab = $cubin->{'.strtab'}{StrTab};
    foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
    {
        $symEnt->{Name} = $strTab->{$symEnt->{name}};

        # Attach symbol to section
        my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
        $secHdr->{SymbolEnt} = $symEnt;

        # Look for symbols tagged FUNC
        if (($symEnt->{info} & 0x0f) == 0x02)
        {
            # Create a hash of kernels for output
            my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;

            # Extract local/global/weak binding info
            $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];

            # Extract the kernel instructions
            $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];

            # Extract the max barrier resource identifier used and add 1. Should be 0-16.
            # If a register is used as a barrier resource id, then this value is the max of 16.
            $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;

            # Extract the number of allocated registers for this kernel.
            $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;

            # Extract the size of shared memory this kernel uses.
            my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
            $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;

            # Attach constant0 section
            $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};

            # Extract the kernel parameter data.
            my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
            if ($paramSec)
            {
                # Extract raw param data
                my @data = unpack "L*", pack "H*", $paramSec->{Data};

                $paramSec->{ParamData} = \@data;
                $paramSec->{ParamHex}  = [ map { sprintf '0x%08x', $_ } @data ];

                # Find the first param delimiter
                my $idx = 0;
                $idx++ while $idx < @data && $data[$idx] != 0x00080a04;

                my $first = $data[$idx+2] & 0xFFFF;
                $idx += 4;

                my @params;
                while ($idx < @data && $data[$idx] == 0x000c1704)
                {
                    # Get the ordinal, offset, size and pointer alignment for each param
                    my $ord    = $data[$idx+2] & 0xFFFF;
                    my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
                    my $psize  = $data[$idx+3] >> 18;
                    my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
                    unshift @params, "$ord:$offset:$psize:$align";
                    $idx += 4;
                }
                my @staticParams = @data[0 .. ($idx-1)];

                my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);

                # Attributes whose payload is a list of 32 bit words, keyed by
                # attribute code; $size is the payload length in bytes.
                my %wordList = (
                    0x1d04 => \@ctaidOffsets,   # EIATTR_S2RCTAID_INSTR_OFFSETS
                    0x1c04 => \@exitOffsets,    # EIATTR_EXIT_INSTR_OFFSETS
                    0x1004 => \@reqntid,        # EIATTR_REQNTID
                    0x0504 => \@maxntid,        # EIATTR_MAX_THREADS
                    0x1e04 => \@stackSize,      # EIATTR_CRS_STACK_SIZE
                );

                while ($idx < @data)
                {
                    my $code = $data[$idx] & 0xffff;
                    my $size = $data[$idx] >> 16;
                    $idx++;

                    # EIATTR_MAXREG_COUNT: the value is stored in the size field itself
                    if ($code == 0x1b03)
                    {
                        $maxregCount = $size;
                    }
                    elsif (my $list = $wordList{$code})
                    {
                        while ($size > 0)
                        {
                            push @$list, $data[$idx++];
                            $size -= 4;
                        }
                    }
                    # EIATTR_CTAIDZ_USED
                    elsif ($code == 0x0401)
                    {
                        $ctaidzUsed = 1;
                    }
                    else
                    {
                        printf STDERR "Unknown Code 0x%02x (size:%d)\n", $code, $size;
                    }
                }
                $kernelSec->{Params}   = \@params;
                $kernelSec->{ParamCnt} = scalar @params;

                $paramSec->{StaticParams} = \@staticParams;
                $paramSec->{MAXREG_COUNT} = $maxregCount;
                $paramSec->{ExitOffsets}  = \@exitOffsets;
                $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
                $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
                $paramSec->{REQNTID}      = \@reqntid;
                $paramSec->{MAXNTID}      = \@maxntid;
                $paramSec->{STACKSIZE}    = \@stackSize;
            }
        }
        # Note GLOBALs found in this cubin
        elsif (($symEnt->{info} & 0x10) == 0x10)
        {
            $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
        }
    }

    return $cubin;
}

# ELF class of the file in bits: 32 or 64.
sub class
{
    my ($cubin) = @_;
    return $cubin->{Class};
}

# Architecture number taken from the low byte of the ELF flags.
sub arch
{
    my ($cubin) = @_;
    return $cubin->{Arch};
}

# 64 when bit 0x400 of the ELF flags is set, otherwise 32.
sub address_size
{
    my ($cubin) = @_;
    return $cubin->{AddressSize};
}

# Hash ref: kernel name => section header of that kernel (undef if none found).
sub listKernels
{
    my ($cubin) = @_;
    return $cubin->{Kernels};
}

# Hash ref: symbol name => symbol entry, for non-FUNC GLOBAL symbols (undef if none found).
sub listSymbols
{
    my ($cubin) = @_;
    return $cubin->{Symbols};
}

# Section header of the named kernel, or undef when the cubin has no such kernel.
sub getKernel
{
    my ($cubin, $kernel) = @_;
    return $cubin->{Kernels}{$kernel};
}
kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63; + die "16 is max barrier count" if $newBar > 16; + + my $paramSec = $kernelSec->{ParamSec}; + my $kernelName = $kernelSec->{SymbolEnt}{Name}; + my $maxregCount = $paramSec->{MAXREG_COUNT}; + my $stackSize = $paramSec->{STACKSIZE}; + + # update the kernel + $kernelSec->{KernelData} = $newData; + $kernelSec->{Data} = unpack "H*", pack "Q*", @$newData; + + if ($newReg != $kernelSec->{RegCnt}) + { + print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n"; + $kernelSec->{RegCnt} = $newReg; + $kernelSec->{info} &= ~0xff000000; + $kernelSec->{info} |= $newReg << 24; + } + if ($newBar != $kernelSec->{BarCnt}) + { + print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n"; + $kernelSec->{BarCnt} = $newBar; + $kernelSec->{flags} &= ~0x01f00000; + $kernelSec->{flags} |= $newBar << 20; + } + + my @paramData = @{$paramSec->{StaticParams}}; + + if (defined $maxregCount) + { + push @paramData, ($maxregCount << 16) | 0x1b03; + } + + my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets; + my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}}; + + if ($newCTAIDs ne $oldCTAIDs) + { + print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"; + } + if (@$ctaidOffsets) + { + push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04; + push @paramData, @$ctaidOffsets; + } + + my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets; + my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}}; + + if ($newExits ne $oldExits) + { + print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"; + } + if (@$exitOffsets) + { + push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04; + push @paramData, @$exitOffsets; + } + + if ($ctaidzUsed != $paramSec->{CTAIDZUsed}) + { + print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"; + } + if ($ctaidzUsed) + { + 
push @paramData, 0x0401; + } + + if (@{$paramSec->{REQNTID}}) + { + push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004; + push @paramData, @{$paramSec->{REQNTID}}; + } + if (@{$paramSec->{MAXNTID}}) + { + push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504; + push @paramData, @{$paramSec->{MAXNTID}}; + } + + if (@$stackSize) + { + push @paramData, (scalar(@$stackSize) << 18) | 0x1e04; + push @paramData, @$stackSize; + } + + my $newParamSize = scalar(@paramData)*4; + $paramSec->{Data} = unpack "H*", pack "L*", @paramData; + if ($newParamSize != $paramSec->{size}) + { + print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n"; + $cubin->updateSize($paramSec, $newParamSize); + } + + if ($newSize != $kernelSec->{size}) + { + print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n"; + $cubin->updateSize($kernelSec, $newSize, 1); + } +} + +sub updateSize +{ + my ($cubin, $sec, $newSize, $updatePrgSize) = @_; + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + # update section header + my $delta = $newSize - $sec->{size}; + $sec->{size} = $newSize; + + # update symtab section + if ($sec->{SymbolEnt}) + { + $sec->{SymbolEnt}{size} = $newSize; + my $symSection = $cubin->{'.symtab'}; + $symSection->{Data} = ''; + foreach my $symEnt (@{$symSection->{SymTab}}) + { + $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}}; + } + } + + my $pos = $elfHdr->{ehSize}; + my %sizeMap; + + # update section header offsets + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + # skip first header + next if $secHdr->{align} == 0; + + # NOBITS data sections are size 0 + my $size = $secHdr->{type} == 8 ? 
0 : $secHdr->{size}; + + # Add any needed padding between sections + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pos += $secHdr->{align} - $pad; + } + # map old offset to new + $sizeMap{$secHdr->{offset}} = $pos; + + # update offset + $secHdr->{offset} = $pos; + + # advance position by size + $pos += $size; + } + + # compute total section header size + my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset}; + + # map old offset to new + $sizeMap{$elfHdr->{shOffset}} = $pos; + $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize; + + $elfHdr->{shOffset} = $pos; + $elfHdr->{phOffset} = $pos + $shSize; + + # update program header offsets and sizes + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + # Not sure how best to adjust these so just assume they'll track other offsets. + $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}}; + + # If the kernel sizes changes, also update the associated ProgramHeader. + # Note that this size is the kernel size plus any constant section sizes. + if ($updatePrgSize && $prgHdr->{type} == 1 && + $sec->{offset} >= $prgHdr->{offset} && + $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta) + { + $prgHdr->{fileSize} += $delta; + $prgHdr->{memSize} += $delta; + } + } +} + +# Write out the cubin after modifying it. 
# Serialize the cubin to $file: ELF header, then every section's data (padded
# with NUL bytes to that section's alignment), then the section header table,
# then the program header table. Section data is stored as a hex string in
# each header's Data field and is converted back to bytes here.
#
# Dies if the file cannot be opened, or if buffered output fails when the
# handle is closed.
sub write
{
    my ($cubin, $file) = @_;

    # Three-argument open: the file name can never be parsed as part of the mode.
    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
    binmode($fh);

    my $elfHdr = $cubin->{elfHdr};
    my $class  = $elfHdr->{fileClass};

    # write elf header
    print {$fh} pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
    my $pos = $elfHdr->{ehSize};

    # write section data
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        # Skip NULL and NOBITS data sections
        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;

        # Add any needed padding between sections
        my $pad = $pos % $secHdr->{align};
        if ($pad > 0)
        {
            $pad = $secHdr->{align} - $pad;
            print {$fh} "\0" x $pad;
            $pos += $pad;
        }

        print {$fh} pack 'H*', $secHdr->{Data};
        $pos += $secHdr->{size};
    }

    # write section headers
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        print {$fh} pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
    }

    # write program headers
    foreach my $prgHdr (@{$cubin->{prgHdrs}})
    {
        print {$fh} pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
    }

    # Buffered write errors (disk full, etc.) only surface at close.
    close $fh or die "Error: could not finish writing $file: $!";
}
# Preprocess and assemble a source file.
#
#   $file    - the assembly source text (handed straight to Preprocess)
#   $include - include location, passed through to Preprocess
#   $doReuse - true: compute register-reuse flags automatically;
#              false: take the reuse flags written in the source
#   $nowarn  - true: suppress "register used without initialization" warnings
#
# Returns a hash ref with RegCnt, BarCnt, ExitOffsets, CTAIDOffsets,
# CTAIDZUsed, ConflictCnt, ReuseCnt, ReuseTot, ReusePct and KernelData (the
# list of 64-bit words: one control word followed by three op codes, repeated).
# Dies on a malformed line, an unknown label or an instruction no grammar rule
# can encode.
sub Assemble
{
    my ($file, $include, $doReuse, $nowarn) = @_;

    my $regMap = {};
    $file = Preprocess($file, $include, 0, $regMap);
    my $vectors = delete $regMap->{__vectors};
    my $regBank = delete $regMap->{__regbank};

    # initialize cubin counts
    my $regCnt = 0;
    my $barCnt = 0;

    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);

    # initialize the first control instruction
    push @instructs, $ctrl = {};

    foreach my $line (split "\n", $file)
    {
        # keep track of line nums in the physical file
        $lineNum++;

        next unless preProcessLine($line);

        # match an instruction
        if (my $inst = processAsmLine($line, $lineNum))
        {
            # Save us from crashing the display driver
            die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n"
                if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0;

            # track branches/jumps/calls/etc for label remapping
            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};

            # push the control code onto the control instruction
            push @{$ctrl->{ctrl}}, $inst->{ctrl};

            # now point the instruction to its associated control instruction
            $inst->{ctrl} = $ctrl;

            # add the op name and full instruction text
            push @instructs, $inst;

            # add a 4th control instruction for every 3 instructions
            push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
        }
        # match a label
        elsif ($line =~ m'^([a-zA-Z]\w*):')
        {
            # map the label name to the index of the instruction about to be inserted
            $labels{$1} = @instructs+0;
        }
        else
        {
            die "badly formed line at $lineNum: $line\n";
        }
    }
    # add the final BRA op and align the number of instructions to a multiple of 8
    push @{$ctrl->{ctrl}}, 0x007ff;
    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
    while (@instructs & 7)
    {
        push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
        push @{$ctrl->{ctrl}}, 0x007e0;
        push @instructs, { op => 'NOP', inst => 'NOP;' };
    }

    # remap labels
    foreach my $i (@branches)
    {
        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
            { die "instruction has invalid label: $instructs[$i]{inst}"; }

        $instructs[$i]{jump} = $labels{$1};

        # relative ops encode the byte distance from the next instruction,
        # absolute ops encode the byte address of the target
        if (exists $relOffset{$instructs[$i]{op}})
            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
        else
            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
    }

    # calculate optimal register reuse
    # This effects register bank decisions so do it before analyzing register use
    foreach my $i (0 .. $#instructs)
    {
        #skip control instructions
        next unless $i & 3;

        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};

        my $match = 0;
        foreach my $gram (@{$grammar{$op}})
        {
            # Apply the rule pattern
            my $capData = parseInstruct($inst, $gram) or next;

            if ($doReuse)
            {
                # get any vector registers for r0
                my @r0 = getVecRegisters($vectors, $capData);

                # There are 2 reuse slots per register slot
                # The reuse hash points to most recent instruction index where register was last used in this slot

                # For writes to a register, clear any reuse opportunity
                if (@r0 && !exists $noDest{$op})
                {
                    foreach my $slot (keys %reuseSlots)
                    {
                        if (my $reuse = $reuse{$slot})
                        {
                            # if writing with a vector op, clear all linked registers
                            delete $reuse->{$_} foreach @r0;
                        }
                    }
                }
                # clear cache if jumping elsewhere
                %reuse = () if exists $jumpOp{$op};

                # only track register reuse for instruction types this works with
                if ($gram->{type}{reuse})
                {
                    foreach my $slot (keys %reuseSlots)
                    {
                        next unless exists $capData->{$slot};

                        my $r = $capData->{$slot};
                        next if $r eq 'RZ';
                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction

                        my $reuse = $reuse{$slot} ||= {};

                        # if this register was previously marked for potential reuse
                        if (my $p = $reuse->{$r})
                        {
                            # flag the previous instruction's ctrl reuse array slot
                            $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot};
                        }
                        # list full, delete the oldest
                        elsif (keys %$reuse > 2)
                        {
                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
                            delete $reuse->{$oldest};
                        }
                        # mark the new instruction for potential reuse
                        $reuse->{$r} = $i;
                    }
                }
            }
            # if reuse is disabled then pull value from code.
            elsif ($gram->{type}{reuse})
            {
                $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData);
            }
            $match = 1;
            last;
        }
        unless ($match)
        {
            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
            die "Unable to encode instruction: $inst\n";
        }
    }

    # Assign registers to requested banks if possible
    foreach my $r (sort keys %$regBank)
    {
        my $bank  = $regBank->{$r};
        my $avail = $regMap->{$r};
        foreach my $pos (0 .. $#$avail)
        {
            if ($bank == ($avail->[$pos] & 3))
            {
                # assign it, while removing the assigned register from the pool
                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
                last;
            }
        }
    }

    # calculate register live times and preferred banks for non-fixed registers.
    # LiveTime only half implemented...
    my (%liveTime, %pairedBanks, %reuseHistory);
    foreach my $i (0 .. $#instructs)
    {
        #skip control instructions
        next unless $i & 3;

        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};

        my $match = 0;
        foreach my $gram (@{$grammar{$op}})
        {
            # Apply the rule pattern
            my $capData   = parseInstruct($inst, $gram) or next;
            my $reuseType = $gram->{type}{reuse};

            # liveTimes and bank conflicts with source operands
            my (%addReuse, %delReuse);
            foreach my $slot (qw(r8 r20 r39))
            {
                my $r = $capData->{$slot} or next;
                next if $r eq 'RZ';

                # auto-allocated registers still map to an array ref of candidates
                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};

                # All registers should be written prior to being read..
                if (my $liveTime = $liveTime{$liveR})
                {
                    # for each read set the current instruction index as the high value
                    $liveTime->[$#$liveTime][1] = $i;
                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
                }
                else
                {
                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
                    push @{$liveTime{$liveR}}, [$i,$i];
                }

                # Is this register active in the reuse cache?
                my $slotHist  = $reuseHistory{$slot} ||= {};
                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;

                # If this is an auto reg, look at the open banks.
                # No need to look at banks if this register is in the reuse cache.
                if (!$selfReuse && ref $regMap->{$r})
                {
                    # Look at other source operands in this instruction and flag what banks are being used
                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
                    {
                        my $r2 = $capData->{$slot2};
                        next if $r2 eq 'RZ' || $r2 eq $r;

                        my $slotHist2 = $reuseHistory{$slot2} ||= {};

                        # Dont be concerned with non-reuse type instructions or
                        # If this operand is in the reuse cache, we don't care what bank it's on.
                        if (!$reuseType || !exists $slotHist2->{$r2})
                        {
                            # if the operand is also an auto-allocated register then link them
                            # Once we choose the bank for one we want to update that choice for the other register.
                            if (ref $regMap->{$r2})
                            {
                                push @{$pairedBanks{$r}{pairs}}, $r2;
                                $pairedBanks{$r}{banks} ||= [];
                            }
                            # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid.
                            else
                            {
                                my $bank = substr($regMap->{$r2},1) & 3;

                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
                                $pairedBanks{$r}{pairs} ||= [];
                            }
                            # Update the total use count for this register.
                            # This will be the number of times the register is pulled out of the bank.
                            $pairedBanks{$r}{useCnt}++;
                        }
                    }
                }
                # update the reuse history so we know which bank conflicts we can ignore.
                if ($reuseType)
                {
                    # flag these slots for addition or removal from reuseHistory
                    if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot})
                        { $addReuse{$slot} = $r; }
                    else
                        { $delReuse{$slot} = $r; }
                }
            }
            # update reuse history after we're done with the instruction (when the flag is actually in effect).
            # we don't want to updated it in the middle since that can interfere with the checks,
            $reuseHistory{$_}{$addReuse{$_}} = 1 foreach keys %addReuse;
            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;

            # liveTimes for destination operands and vector registers
            foreach my $r0 (getVecRegisters($vectors, $capData))
            {
                # fixed register mappings can have aliases so use the actual register value for those.
                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};

                # If not writing treat just like a read
                if (exists $noDest{$op})
                {
                    if (my $liveTime = $liveTime{$liveR})
                    {
                        $liveTime->[$#$liveTime][1] = $i;
                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
                    }
                    else
                    {
                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
                        push @{$liveTime{$liveR}}, [$i,$i];
                    }
                }
                # If writing, push a new bracket on this register's stack.
                elsif (my $liveTime = $liveTime{$liveR})
                {
                    if ($i > $liveTime->[$#$liveTime][1])
                    {
                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
                    }
                }
                else
                {
                    # Initialize the liveTime stack for this register.
                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
                }
            }

            $match = 1;
            last;
        }
        unless ($match)
        {
            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
            die "Unable to encode instruction: $inst\n";
        }
    }

    # assign unassigned registers
    # sort by most restricted, then most used, then name
    foreach my $r (sort {
                    $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
                    $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
                    $a cmp $b
                  } keys %pairedBanks)
    {
        my $banks = $pairedBanks{$r}{banks};
        my $avail = $regMap->{$r};

        # Pick a bank with zero or the smallest number of conflicts
        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3))
        {
            # pick an available register that matches the requested bank
            foreach my $pos (0 .. $#$avail)
            {
                if ($bank == ($avail->[$pos] & 3))
                {
                    # assign it, while removing the assigned register from the pool
                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;

                    # update bank info for any unassigned pair
                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
                    last BANK;
                }
            }
        }
    }
    # Now assign any remaining to first available
    foreach my $r (sort keys %$regMap)
    {
        if (ref($regMap->{$r}) eq 'ARRAY')
        {
            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
        }
    }

    # apply the register mapping and assemble the instructions to op codes
    foreach my $i (0 .. $#instructs)
    {
        #skip control instructions
        next unless $i & 3;

        # save the original and replace the register names with numbers
        $instructs[$i]{orig} = $instructs[$i]{inst};
        # NOTE(review): this statement reached review with its middle missing
        # (everything between a '<' and the next '>' had been stripped, leaving
        # an s/// with only two delimiters). Reconstructed as: any whole word
        # that is not preceded by '.' and not followed by '[' and that names a
        # mapped register is replaced by its mapping - confirm against the
        # pristine file.
        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;

        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};

        my $match = 0;
        foreach my $gram (@{$grammar{$op}})
        {
            # Apply the rule pattern
            my $capData = parseInstruct($inst, $gram) or next;

            # update the register count
            foreach my $r (qw(r0 r8 r20 r39))
            {
                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';

                # get numeric portion of regname
                my $val = substr $capData->{$r}, 1;

                # smart enough to count vector registers for memory instructions:
                # a vector destination (r0) or vector address (r8) occupies
                # several consecutive registers. The width must be chosen per
                # slot in ONE expression; two successive "my $regInc" lines let
                # the r8 line overwrite the r0 width with 1.
                my @vec = $r eq 'r0' ? getVecRegisters($vectors, $capData)
                        : $r eq 'r8' ? getAddrVecRegisters($vectors, $capData)
                        :              ();
                my $regInc = scalar(@vec) || 1;

                if ($val + $regInc > $regCnt)
                {
                    $regCnt = $val + $regInc;
                }
            }
            # update the barrier resource count
            if ($op eq 'BAR')
            {
                if (exists $capData->{i8w4})
                {
                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
                }
                # if a barrier value is a register, assume the maximum
                elsif (exists $capData->{r8})
                {
                    $barCnt = 16;
                }
            }
            # Generate the op code.
            my ($code, $reuse) = genCode($op, $gram, $capData);
            $instructs[$i]{code} = $code;

            # cache this for final pass when we want to calculate reuse stats.
            if ($gram->{type}{reuse})
                { $instructs[$i]{caps} = $capData; }
            # use the parsed value of reuse for non-reuse type instructions
            else
                { $ctrl->{reuse}[($i & 3) - 1] = $reuse; }

            $match = 1;
            last;
        }
        unless ($match)
        {
            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
            die "Unable to encode instruction: $inst\n";
        }
    }

    # final pass to piece together control codes
    # (%regHealth is a fresh statistics hash filled in by registerHealth; it is
    # deliberately distinct from the per-slot %reuseHistory used above.)
    my (@codes, %regHealth, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
    foreach my $i (0 .. $#instructs)
    {
        # op code
        if ($i & 3)
        {
            push @codes, $instructs[$i]{code};

            if ($instructs[$i]{caps})
            {
                # calculate stats on registers
                registerHealth(\%regHealth, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
            }
            if ($instructs[$i]{inst} =~ m'EXIT')
            {
                push @exitOffsets, (scalar(@codes)-1)*8;
            }
            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
            {
                push @ctaidOffsets, (scalar(@codes)-1)*8;
                $ctaidzUsed = 1 if $1 eq 'Z';
            }
        }
        # control code
        else
        {
            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
            push @codes,
                ($ctrl->[0] <<  0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes
                ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59);  # reuse codes
        }
    }

    # return the kernel data
    return {
        RegCnt       => $regCnt,
        BarCnt       => $barCnt,
        ExitOffsets  => \@exitOffsets,
        CTAIDOffsets => \@ctaidOffsets,
        CTAIDZUsed   => $ctaidzUsed,
        ConflictCnt  => $regHealth{conflicts},
        ReuseCnt     => $regHealth{reuse},
        ReuseTot     => $regHealth{total},
        ReusePct     => ($regHealth{total} ? 100 * $regHealth{reuse} / $regHealth{total} : 0),
        KernelData   => \@codes,
    };
}
$inst->{inst} : '') if $gram->{type}{reuse}; + + $inst->{caps} = join ', ', sort @caps; + $inst->{codeDiff} = $fileCode ^ $code; + $inst->{reuseDiff} = $fileReuse ^ $reuse; + + # compare calculated and file values + if ($code == $fileCode && $reuse == $fileReuse) + { + $inst->{grade} = 'PASS'; + push @instructs, $inst if $all; + $pass++; + } + else + { + $inst->{grade} = 'FAIL'; + push @instructs, $inst; + $fail++; + } + $match = 1; + last; + } + unless ($match) + { + $inst->{grade} = 'FAIL'; + $inst->{codeDiff} = $fileCode; + $inst->{reuseDiff} = $fileReuse; + push @instructs, $inst; + $fail++; + } + } + } + my %maxLen; + foreach (@instructs) + { + $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}}; + } + my ($lastOp, $template); + foreach my $inst (sort { + $a->{op} cmp $b->{op} || + $a->{codeDiff} <=> $b->{codeDiff} || + $a->{reuseDiff} <=> $b->{reuseDiff} || + $a->{ins} cmp $b->{ins} + } @instructs) + { + if ($lastOp ne $inst->{op}) + { + $lastOp = $inst->{op}; + $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s %s\n"; + printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures); + } + printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)}; + } + my $reusePct = $reuseHistory{total} ? 
# Convert cuobjdump sass to the working format.
#
#   $in     - read handle positioned on the sass listing
#   $out    - write handle that receives the converted text
#   $params - array ref of "ordinal:offset:size:align" parameter descriptors
#
# Emits a name => constant-bank table (launch dimensions first, then one entry
# per kernel parameter, or per 4-byte word for larger parameters), followed by
# the instructions with branch targets turned into TARGETn labels and known
# c[0x0][...] operands replaced by their names. Reading stops at the
# terminating self-branch.
sub Extract
{
    my ($in, $out, $params) = @_;

    my %constants =
    (
        blockDimX => 'c[0x0][0x8]',
        blockDimY => 'c[0x0][0xc]',
        blockDimZ => 'c[0x0][0x10]',
        gridDimX  => 'c[0x0][0x14]',
        gridDimY  => 'c[0x0][0x18]',
        gridDimZ  => 'c[0x0][0x1c]',
    );

    # constant-bank address => symbolic name, used to rewrite operands below
    my %paramMap;

    print $out "\n";
    for my $name (sort keys %constants)
    {
        my $addr = $constants{$name};
        print $out " $name : $addr\n";
        $paramMap{$addr} = $name;
    }
    print $out "\n";

    for my $spec (@$params)
    {
        my ($ord, $offset, $size, $align) = split ':', $spec;

        # A parameter of at most one word gets a single plain entry.
        if ($size <= 4)
        {
            my $param = sprintf 'param_%d', $ord;
            my $const = sprintf 'c[0x0][%s]', $offset;
            $paramMap{$const} = $param;
            print $out " $param : $const\n";
            next;
        }

        # Larger parameters are exposed as an indexed entry per 4-byte word.
        my $base = hex $offset;
        for (my $num = 0; $size > 0; $num++, $size -= 4)
        {
            my $param = sprintf 'param_%d[%d]', $ord, $num;
            my $const = sprintf 'c[0x0][0x%x]', $base + 4 * $num;
            $paramMap{$const} = $param;
            print $out " $param : $const\n";
        }
    }
    print $out "\n\n";

    my (%labels, @data);
    my $labelnum = 1;

    FILE: while (my $line = <$in>)
    {
        my (@ctrl, @ruse);
        next unless processSassCtrlLine($line, \@ctrl, \@ruse);

        # each control line is followed by one instruction line per control code
        CTRL: foreach my $ctrl (@ctrl)
        {
            $line = <$in>;

            my $inst = processSassLine($line) or next CTRL;

            # Convert branch/jump/call addresses to labels
            if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)')
            {
                my $target = hex($1);

                # skip the final BRA and stop processing the file
                last FILE if $inst->{op} eq 'BRA' && ($target == $inst->{num} || $target == $inst->{num}-8);

                # reuse the label already generated for this address, or mint the next one
                my $label = $labels{$target} ||= 'TARGET' . $labelnum++;

                # replace address with name
                $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/;
            }
            $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg;

            $inst->{ctrl} = printCtrl($ctrl);

            push @data, $inst;
        }
    }

    # second pass: every target address is known now, so labels can be placed
    for my $inst (@data)
    {
        my $addr = $inst->{num};
        print $out "$labels{$addr}:\n" if exists $labels{$addr};
        printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)};
    }
}
die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg; + + #Pull in the constMap + $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg; + + my @newFile; + foreach my $line (split "\n", $file) + { + # skip comments + if ($line !~ m'^\s*(?:#|//).*') + { + $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg; + } + push @newFile, $line; + } + $file = join "\n", @newFile; + + # Pull in the reg map first as the Scheduler will need it to handle vector instructions + # Remove the regmap if we're going on to assemble + $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg; + + # Pick out the SCHEDULE_BLOCK sections + my @schedBlocks = $file =~ /$ScheduleRe/g; + + # Schedule them + foreach my $i (0 .. $#schedBlocks) + { + # XMAD macros should only appear in SCHEDULE_BLOCKs + $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]); + + $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug); + } + + # Replace the results + $file =~ s|$ScheduleRe| shift @schedBlocks |eg; + + return $file; +} + +# break the registers down into source and destination categories for the scheduler +my %srcReg = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X); +my %destReg = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC); +my %regops = (%srcReg, %destReg); +my @itypes = qw(class lat rlat tput dual); + +sub Scheduler +{ + my ($block, $blockNum, $regMap, $debug) = @_; + + my $vectors = $regMap->{__vectors}; + my $lineNum = 0; + + my (@instructs, @comments, $ordered, $first); + foreach my $line (split "\n", $block) + { + # keep track of line nums in the physical file + $lineNum++; + + unless (preProcessLine($line)) + { + push @comments, $line if $line =~ m'\S'; + next; + } + + # match an instruction + if (my $inst = processAsmLine($line, $lineNum)) + { + # if the first instruction in the block is waiting on a dep, it should go first. + $inst->{first} = !$first++ && ($inst->{ctrl} & 0x1f800) ? 
0 : 1; + + # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block) + #$inst->{first} = $inst->{ctrl} & 0x0000f ? 1 : 2; + $inst->{exeTime} = 0; + $inst->{order} = $ordered++ if $ordered; + push @instructs, $inst; + } + # match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n"; + } + # open an ORDERED block + elsif ($line =~ m'^') + { + die "you cannot use nested tags" if $ordered; + $ordered = 1; + } + # close an ORDERED block + elsif ($line =~ m'^') + { + die "missing opening for closing tag" if !$ordered; + $ordered = 0; + } + else + { + die "badly formed line at block: $blockNum line: $lineNum: $line\n"; + } + } + + my (%writes, %reads, @ready, @schedule, $orderedParent); + # assemble the instructions to op codes + foreach my $instruct (@instructs) + { + my $match = 0; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $capData = parseInstruct($instruct->{inst}, $gram) or next; + my (@dest, @src); + + # copy over instruction types for easier access + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + + # A predicate prefix is treated as a source reg + push @src, $instruct->{predReg} if $instruct->{pred}; + + # Handle P2R and R2P specially + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + my $list = $instruct->{op} eq 'R2P' ? 
\@dest : \@src; + my $mask = hex($capData->{i20w7}); + foreach my $p (0..6) + { + if ($mask & (1 << $p)) + { + push @$list, "P$p"; + } + # make this instruction dependent on any predicates it's not setting + # this is to prevent a race condition for any predicate sets that are pending + elsif ($instruct->{op} eq 'R2P') + { + push @src, "P$p"; + } + } + # These instructions can't be dual issued + $instruct->{nodual} = 1; + } + + # Populate our register source and destination lists, skipping any zero or true values + foreach my $operand (grep { exists $regops{$_} } sort keys %$capData) + { + # figure out which list to populate + my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src; + + # Filter out RZ and PT + my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT'; + + if ($capData->{$operand} ne $badVal) + { + # add the value to list with the correct prefix + push @$list, + $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) : + $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) : + $operand eq 'CC' ? 'CC' : + $operand eq 'X' ? 'CC' : + getRegNum($regMap, $capData->{$operand}); + } + } + $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39}); + + # Find Read-After-Write dependencies + foreach my $src (grep { exists $writes{$_} } @src) + { + # Memory operations get delayed access to registers but not to the predicate + my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat}; + + # the parent should be the most recently added dest op to the stack + foreach my $parent (@{$writes{$src}}) + { + # add this instruction as a child of the parent + # set the edge to the total latency of reg source availability + #print "R $parent->{inst}\n\t\t$instruct->{inst}\n"; + my $latency = $src =~ m'^P\d' ? 
13 : $parent->{lat}; + push @{$parent->{children}}, [$instruct, $latency - $regLatency]; + $instruct->{parents}++; + + # if the destination was conditionally executed, we also need to keep going back till it wasn't + last unless $parent->{pred}; + } + } + + # Find Write-After-Read dependencies + foreach my $dest (grep { exists $reads{$_} } @dest) + { + # Flag this instruction as dependent to any previous read + foreach my $reader (@{$reads{$dest}}) + { + # no need to stall for these types of dependencies + #print "W $reader->{inst} \t\t\t $instruct->{inst}\n"; + push @{$reader->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + # Once dependence is marked we can clear out the read list (unless this write was conditional). + # The assumption here is that you would never want to write out a register without + # subsequently reading it in some way prior to writing it again. + delete $reads{$dest} unless $instruct->{pred}; + } + + # Enforce instruction ordering where requested + if ($instruct->{order}) + { + if ($orderedParent) + { + push @{$orderedParent->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + $orderedParent = $instruct; + } + elsif ($orderedParent) + { $orderedParent = 0; } + + # For a dest reg, push it onto the write stack + unshift @{$writes{$_}}, $instruct foreach @dest; + + # For a src reg, push it into the read list + push @{$reads{$_}}, $instruct foreach @src; + + # if this instruction has no dependencies it's ready to go + push @ready, $instruct if !exists $instruct->{parents}; + + $match = 1; + last; + } + die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match; + } + %writes = (); + %reads = (); + + if (@ready) + { + # update dependent counts for sorting hueristic + my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" }; + + countUniqueDescendants($readyParent, {}); + updateDepCounts($readyParent, {}); + + # sort the initial ready list + @ready = sort 
{ + $a->{first} <=> $b->{first} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + # Process the ready list, adding new instructions to the list as we go. + my $clock = 0; + while (my $instruct = shift @ready) + { + my $stall = $instruct->{stall}; + + # apply the stall to the previous instruction + if (@schedule && $stall < 16) + { + my $prev = $schedule[$#schedule]; + + # if stall is greater than 4 then also yield + # the yield flag is required to get stall counts 12-15 working correctly. + $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0; + $prev->{ctrl} |= $stall; + $clock += $stall; + } + # For stalls bigger than 15 we assume the user is managing it with a barrier + else + { + $instruct->{ctrl} &= 0x1fff0; + $instruct->{ctrl} |= 1; + $clock += 1; + } + print "$clock: $instruct->{inst}\n" if $debug; + + # add a new instruction to the schedule + push @schedule, $instruct; + + # update each child with a new earliest execution time + if (my $children = $instruct->{children}) + { + foreach (@$children) + { + my ($child, $latency) = @$_; + + # update the earliest clock value this child can safely execute + my $earliest = $clock + $latency; + $child->{exeTime} = $earliest if $child->{exeTime} < $earliest; + + print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug; + + # decrement parent count and add to ready queue if none remaining. + push @ready, $child if --$child->{parents} < 1; + } + delete $instruct->{children}; + } + + # update stall and mix values in the ready queue on each iteration + foreach my $ready (@ready) + { + # calculate how many instructions this would cause the just added instruction to stall. 
+ $stall = $ready->{exeTime} - $clock; + $stall = 1 if $stall < 1; + + # if using the same compute resource as the prior instruction then limit the throughput + if ($ready->{class} eq $instruct->{class}) + { + $stall = $ready->{tput} if $stall < $ready->{tput}; + } + # dual issue with a simple instruction (tput <= 2) + # can't dual issue two instructions that both load a constant + elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} && + $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const})) + { + $stall = 0; + } + $ready->{stall} = $stall; + + # add an instruction class mixing huristic that catches anything not handled by the stall + $ready->{mix} = $ready->{class} ne $instruct->{class} || 0; + } + + # sort the ready list by stall time, mixing huristic, dependencies and line number + @ready = sort { + $a->{first} <=> $b->{first} || + $a->{stall} <=> $b->{stall} || + $b->{mix} <=> $a->{mix} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(f exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + my $out; + #$out .= "$_\n" foreach @comments; + $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule; + return $out; +} + +sub setConstMap +{ + my ($constMap, $constMapText) = @_; + + foreach my $line (split "\n", $constMapText) + { + # strip leading space + $line =~ s|^\s+||; + # strip comments + $line =~ s{(?:#|//).*}{}; + # strip trailing space + $line =~ s|\s+$||; + # skip blank lines + next unless $line =~ m'\S'; + + my ($name, $value) = split '\s*:\s*', $line; + + $constMap->{$name} = $value; + } + return; +} + +sub setRegisterMap +{ + my ($regMap, $regmapText) = @_; + + my $vectors = $regMap->{__vectors} ||= {}; + my $regBank = $regMap->{__regbank} ||= {}; + my %aliases; + + foreach my $line (split 
"\n", $regmapText) + { + # strip leading space + $line =~ s|^\s+||; + # strip comments + $line =~ s{(?:#|//).*}{}; + # strip trailing space + $line =~ s|\s+$||; + # skip blank lines + next unless $line =~ m'\S'; + + my $auto = $line =~ /~/; + my $share = $line =~ /=/; + + my ($regNums, $regNames) = split '\s*[:~=]\s*', $line; + + my (@numList, @nameList, %vecAliases); + foreach my $num (split '\s*,\s*', $regNums) + { + my ($start, $stop) = split '\s*\-\s*', $num; + die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop; + push @numList, ($start .. $stop||$start); + } + foreach my $fullName (split '\s*,\s*', $regNames) + { + if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$') + { + my ($name1, $name2, $bank) = ($1, $3, $4); + foreach (split '\s*\|\s*', $2) + { + my ($start, $stop) = split '\s*\-\s*'; + foreach my $r (map "$name1$_$name2", $start .. $stop||$start) + { + # define an alias for use in vector instructions that omits the number portion + $aliases{$r} = "$name1$name2" unless exists $aliases{$r}; + push @nameList, $r; + $regBank->{$r} = $bank if $auto && defined $bank; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank; + } + } + } + elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$') + { + push @nameList, $1; + $regBank->{$1} = $2 if $auto && defined $2; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2; + } + else + { + die "Bad register name: '$fullName' at: $line\n"; + } + } + die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList; + die "Missmatched register mapping at: $line\n" if $share && @numList > 1; + + # detect if this list is monotonically ascending with no gaps + my $i = 0; + while ($i < $#numList-1) + { + last if $numList[$i] + 1 != $numList[$i+1]; + $i++; + } + my $ascending = $i+1 == $#numList; + + foreach my $n 
(0..$#nameList) + { + die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]}; + + if ($auto) + { + # assign possible values to be assigned on assembly + $regMap->{$nameList[$n]} = \@numList; + } + elsif ($share) + { + # each name shares the same single register + $regMap->{$nameList[$n]} = 'R' . $numList[0]; + } + else + { + $regMap->{$nameList[$n]} = 'R' . $numList[$n]; + # flag any even register as a potential vector + if ($ascending && ($numList[$n] & 1) == 0) + { + # constrain potential range to vector alignment + my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3); + if ($end <= $#nameList) + { + $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ]; + #setup an alias for the base name without the number + if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}}) + { + $regMap->{$aliases{$nameList[$n]}} = $regMap->{$nameList[$n]}; + $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]}; + delete $aliases{$nameList[$n]}; + } + } + } + } + } + } + #print Dumper($regMap); exit(1); +} + +sub preProcessLine +{ + # strip leading space + $_[0] =~ s|^\s+||; + + # preserve comment but check for emptiness + my $val = shift; + + # strip comments + $val =~ s{(?:#|//).*}{}; + + # skip blank lines + return $val =~ m'\S'; +} + +# traverse the graph and count total descendants per node. +# only count unique nodes (by lineNum) +sub countUniqueDescendants +{ + my ($node, $edges) = @_; + + #warn "$node->{inst}\n"; + + if (my $children = $node->{children}) + { + foreach my $child (grep $_->[1], @$children) # skip WaR deps and traversed edges + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + + $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges); + } + } + else + { + return $node->{lineNum}; + } + return ($node->{lineNum}, keys %{$node->{deps}}); +} +# convert hash to count for easier sorting. 
# Walk the dependency graph below $node and replace each node's deps hash
# with a plain count so the scheduler can sort on it numerically.
#   $node  - hash ref with lineNum, optional children ([child, latency] pairs) and deps
#   $edges - hash ref of "parent^child" lineNum keys already walked
# A node whose deps is not a reference is coerced to a number (undef becomes 0).
sub updateDepCounts
{
    my ($node, $edges) = @_;

    for my $edge (@{ $node->{children} || [] })
    {
        my $kid = $edge->[0];

        # each edge is followed once only
        next if $edges->{"$node->{lineNum}^$kid->{lineNum}"}++;
        updateDepCounts($kid, $edges);
    }

    my $deps = $node->{deps};
    $node->{deps} = ref($deps) ? scalar(keys %$deps) : $deps + 0;
}

# Detect register bank conflicts and calculate reuse stats
#   $reuseHistory - hash ref carried between calls; holds one sub-hash per operand
#                   slot plus the running total / reuse / conflicts counters
#   $reuseFlags   - bit mask tested against $reuseSlots{$slot} for each slot
#   $capData      - captured operands of the instruction (r8, r20 and r39 are examined)
#   $instAddr     - address shown in the conflict report
#   $inst         - instruction text shown in the conflict report (no report if false)
#   $nowarn       - true to suppress the report
# Returns the number of registers involved in a bank conflict (0 if none).
sub registerHealth
{
    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;

    my (@bankOwner, @conflicts);

    for my $slot (qw(r8 r20 r39))
    {
        my $reg = $capData->{$slot};
        next if !$reg || $reg eq 'RZ';

        my $slotHist = $reuseHistory->{$slot} ||= {};

        $reuseHistory->{total}++;

        # if this register is in active reuse then ignore for bank conflict checking.
        if (exists $slotHist->{$reg})
        {
            $reuseHistory->{reuse}++;
        }
        else
        {
            # extract number from reg and take the modulo-4 value. This is the bank id.
            my $bank  = substr($reg, 1) & 3;
            my $owner = $bankOwner[$bank];

            # a different register already read from this bank: conflict
            if ($owner && $owner ne $reg)
            {
                push @conflicts, $owner unless @conflicts;
                push @conflicts, $reg;

                $reuseHistory->{conflicts}++;
            }
            $bankOwner[$bank] = $reg;
        }

        # update the history
        if ($reuseFlags & $reuseSlots{$slot})
        {
            $slotHist->{$reg} = 1;
        }
        else
        {
            delete $slotHist->{$reg};
        }
    }

    if ($inst && @conflicts && !$nowarn)
    {
        # the instruction text is passed as an argument, never as part of the
        # format, so a '%' inside it is printed literally
        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
    }
    return scalar @conflicts;
}

1;

__END__

=head1 NAME

MaxAs::MaxAs - Assembler for NVIDIA Maxwell architecture

=head1 SYNOPSIS

    maxas.pl [opts]

=head1 DESCRIPTION

See the documentation at: https://github.com/NervanaSystems/maxas

=head1 SEE ALSO

See the documentation at: https://github.com/NervanaSystems/maxas


=head1 AUTHOR

Scott Gray, Esgray@nervanasys.com

=head1 COPYRIGHT AND LICENSE

The MIT License (MIT)

Copyright (c) 2014 Scott Gray

Permission is hereby granted, free of charge,
to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +=cut diff --git a/Assembler/PascalAs/blib/lib/MaxAs/MaxAsGrammar.pm b/Assembler/PascalAs/blib/lib/MaxAs/MaxAsGrammar.pm new file mode 100644 index 0000000..fc61543 --- /dev/null +++ b/Assembler/PascalAs/blib/lib/MaxAs/MaxAsGrammar.pm @@ -0,0 +1,1437 @@ +package MaxAs::MaxAsGrammar; + +use strict; +use Carp; +use Exporter; +use Data::Dumper; +our @ISA = qw(Exporter); + +our @EXPORT = qw( + %grammar %flags + parseInstruct genCode genReuseCode + processAsmLine processSassLine processSassCtrlLine + replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters +); + +require 5.10.0; + +# Helper functions for operands +sub getI +{ + my ($orig, $pos, $mask) = @_; + my $val = $orig; + my $neg = $val =~ s|^\-||; + + # parse out our custom index immediates for addresses + if ($val =~ m'^(\d+)[xX]<([^>]+)>') + { + # allow any perl expression and multiply result by leading decimal. + # also allow global scalar varibles in the expression. 
+ my $mul = $1; + my $exp = $2; + # strip leading zeros (don't interpret numbers as octal) + $exp =~ s/(?> $trunc) & 0x7ffff if $trunc; + } + return $val << $pos; +} +sub getR +{ + my ($val, $pos) = @_; + if ($val =~ m'^R(\d+|Z)$' && $1 < 255) + { + $val = $1 eq 'Z' ? 0xff : $1; + } + else + { + die "Bad register name found: $val\n"; + } + return $val << $pos; +} +sub getP +{ + my ($val, $pos) = @_; + if ($val =~ m'^P(\d|T)$' && $1 < 7) + { + $val = $1 eq 'T' ? 7 : $1; + } + else + { + die "Bad predicate name found: $val\n"; + } + return $val << $pos; +} +sub getC { ((hex($_[0]) >> 2) & 0x7fff) << 20 } + +# Map operands into their value and position in the op code. +my %operands = +( + p0 => sub { getP($_[0], 0) }, + p3 => sub { getP($_[0], 3) }, + p12 => sub { getP($_[0], 12) }, + p29 => sub { getP($_[0], 29) }, + p39 => sub { getP($_[0], 39) }, + p45 => sub { getP($_[0], 45) }, + p48 => sub { getP($_[0], 48) }, + p58 => sub { getP($_[0], 58) }, + r0 => sub { getR($_[0], 0) }, + r8 => sub { getR($_[0], 8) }, + r20 => sub { getR($_[0], 20) }, + r28 => sub { getR($_[0], 28) }, + r39s20 => sub { getR($_[0], 39) }, + r39 => sub { getR($_[0], 39) }, + r39a => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to whipe it out, register must be in sequence with r20 + c20 => sub { getC($_[0]) }, + c39 => sub { getC($_[0]) }, + c34 => sub { hex($_[0]) << 34 }, + c36 => sub { hex($_[0]) << 36 }, + f20w32 => sub { getF($_[0], 20, 'f') }, + f20 => sub { getF($_[0], 20, 'f', 12) }, + d20 => sub { getF($_[0], 20, 'd', 44) }, + i8w4 => sub { getI($_[0], 8, 0xf) }, + i20 => sub { getI($_[0], 20, 0x7ffff) }, + i20w6 => sub { getI($_[0], 20, 0x3f) }, + i20w7 => sub { getI($_[0], 20, 0x7f) }, + i20w8 => sub { getI($_[0], 20, 0xff) }, + i20w12 => sub { getI($_[0], 20, 0xfff) }, + i20w24 => sub { getI($_[0], 20, 0xffffff) }, + i20w32 => sub { getI($_[0], 20, 0xffffffff) }, + i31w4 => sub { getI($_[0], 31, 0xf) }, + i34w13 => sub { getI($_[0], 34, 0x1fff) }, + 
i36w20 => sub { getI($_[0], 36, 0xfffff) }, + i39w8 => sub { getI($_[0], 39, 0xff) }, + i28w8 => sub { getI($_[0], 28, 0xff) }, + i28w20 => sub { getI($_[0], 28, 0xfffff) }, + i48w8 => sub { getI($_[0], 48, 0xff) }, + i51w5 => sub { getI($_[0], 51, 0x1f) }, + i53w5 => sub { getI($_[0], 53, 0x1f) }, +); + +# Rules for operands and their closely tied flags +my $hex = qr"0[xX][0-9a-fA-F]+"; +my $iAddr = qr"\d+[xX]<[^>]+>"; +my $immed = qr"$hex|$iAddr|\d+"o; +my $reg = qr"[a-zA-Z_]\w*"; # must start with letter or underscore\ +my $p = qr"P[0-6T]"; +my $noPred = qr"(?)"; +my $pred = qr"\@(?\!)?P(?[0-6]) "; +my $p0 = qr"(?$p)"o; +my $p3 = qr"(?$p)"o; +my $p12 = qr"(?\!)?(?$p)"o; +my $p29 = qr"(?\!)?(?$p)"o; +my $p39 = qr"(?\!)?(?$p)"o; +my $p45 = qr"(?$p)"o; +my $p48 = qr"(?$p)"o; +my $p58 = qr"(?$p)"o; +my $r0 = qr"(?$reg)"; +my $r0cc = qr"(?$reg)(?\.CC)?"; +my $r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r28 = qr"(?$reg)"; +my $r39s20 = qr"(?\-)?(?\|)?(?(?$reg))\|?(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39 = qr"(?\-)?(?$reg)(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39a = qr"(?(?$reg))(?\.reuse)?"; +my $c20 = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20x = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20s39 = qr"(?\-)?c\[(?$hex)\]\s*\[(?$hex)\]"o; +my $f20w32 = qr"(?(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))"; +my $f20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $d20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $i8w4 = qr"(?$immed)"o; +my $i20 = qr"(?(?\-)?$immed)(?\.NEG)?"o; +my $i20w6 = qr"(?$immed)"o; +my $i20w7 = qr"(?$immed)"o; +my $i20w8 = qr"(?$immed)"o; +my $i20w12 = qr"(?$immed)"o; +my $i20w24 = qr"(?\-?$immed)"o; +my $i20w32 = qr"(?\-?$immed)"o; +my $i39w8 = qr"(?\-?$immed)"o; +my $i28w8 = 
qr"(?$immed)"o; +my $i28w20 = qr"(?\-?$immed)"o; +my $i31w4 = qr"(?$immed)"o; +my $i34w13 = qr"(?$immed)"o; +my $i36w20 = qr"(?$immed)"o; +my $i48w8 = qr"(?$immed)"o; +my $i51w5 = qr"(?$immed)"o; +my $i53w5 = qr"(?$immed)"o; +my $ir20 = qr"$i20|$r20"o; +my $cr20 = qr"$c20|$r20"o; +my $icr20 = qr"$i20|$c20|$r20"o; +my $fcr20 = qr"$f20|$c20|$r20"o; +my $cr39 = qr"$c20s39|$r39"o; +my $dr20 = qr"$d20|$r20"o; + +# Instruction specific rules for capturing various flags +my $u32 = qr"(?\.U32)?"; +my $ftz = qr"(?\.FTZ)?"; +my $sat = qr"(?\.SAT)?"; +my $rnd = qr"(?:\.(?RN|RM|RP|RZ))?"; +my $round = qr"(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?"; +my $fcmp = qr"(?\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; +my $icmp = qr"\.(?LT|EQ|LE|GT|NE|GE)"; +my $bool = qr"\.(?AND|OR|XOR|PASS_B)"; +my $bool2 = qr"\.(?AND|OR|XOR)"; +my $func = qr"\.(?COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)"; +my $rro = qr"\.(?SINCOS|EX2)"; +my $add3 = qr"(?:\.(?X|RS|LS))?"; +my $lopz = qr"(?:\.(?NZ|Z) $p48,|(?))"o; +my $X = qr"(?\.X)?"; +my $tld = qr"(?NODEP\.)?(?:(?T)|(?P))"; +my $chnls = qr"(?R|RGBA)"; +my $sr = qr"SR_(?\S+)"; +my $shf = qr"(?\.W)?(?:\.(?U64|S64))?(?\.HI)?"; +my $xmad = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $xmadc = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $vmad8 = qr"\.(?[SU])(?8|16)\.(?[SU])(?8|16)(?\.PO)?(?\.SHR_7)?(?\.SHR_15)?(?\.SAT)?"; +my $vmad16= qr"\.(?[SU])(?16)\.(?[SU])(?16)"; +my $hilo = qr"(?:\.(?XHI|XLO))?"; +my $vaddType = qr"(?:\.(?UD))?(?:\.(?SD))?(?:\.(?[SU])(?8|16|32))?(?:\.(?[SU])(?8|16|32))?"; +my $vaddMode = qr"(?:\.(?MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?"; +my $vmnmx = qr"(?:\.(?MX))?"; +my $x2x = qr"\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)"; +my $prmt = qr"(?:\.(?F4E|B4E|RC8|ECL|ECR|RC16))?"; +my $shfl = qr"\.(?IDX|UP|DOWN|BFLY)"; +my $bar = qr"\.(?SYNC|ARV|RED)(?:\.(?POPC|AND|OR))? 
(?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?()|(?))(?(), $p39|(?))"o; +my $b2r = qr"\.RESULT $r0(?:, $p45|(?))"o; +my $dbar = qr"(?SB0|SB1|SB2|SB3|SB4|SB5)"; +my $dbar2 = qr" {(?5)?,?(?4)?,?(?3)?,?(?2)?,?(?1)?,?(?0)?}"; +my $mbar = qr"\.(?CTA|GL|SYS)"; +my $addr = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i20w24)?\]"o; +my $addr2 = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i28w20)?\]"o; +my $ldc = qr"c\[(?$hex)\]\s*$addr"o; +my $atom = qr"(?\.E)?(?:\.(?ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; +my $vote = qr"\.(?ALL|ANY|EQ)"o; +my $memType = qr"(?\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; +my $memCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CI|CS|CV|IL|WT))?"; + + + +# class: hardware resource that shares characteristics with types +# lat : pipeline depth where relevent, placeholder for memory ops +# blat : barrier latency, typical fetch time for memory operations. Highly variable. +# rlat : operand read latency for memory ops +# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op. +# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession. +# dual : whether this instruction type can be dual issued +# reuse: whether this instruction type accepts register reuse flags. + +# Some of these values are guesses and need to be updated from micro benchmarks. +# We may need to split these classes up further. 
+my $s2rT = {class => 's2r', lat => 2, blat => 25, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $smemT = {class => 'mem', lat => 2, blat => 30, rlat => 2, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $gmemT = {class => 'mem', lat => 2, blat => 200, rlat => 4, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $x32T = {class => 'x32', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 1}; +my $x64T = {class => 'x64', lat => 2, blat => 128, rlat => 0, rhold => 0, tput => 128, dual => 0, reuse => 1}; +my $shftT = {class => 'shift', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $cmpT = {class => 'cmp', lat => 13, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $qtrT = {class => 'qtr', lat => 8, blat => 0, rlat => 4, rhold => 0, tput => 1, dual => 1, reuse => 0}; +my $rroT = {class => 'rro', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $voteT = {class => 'vote', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; + + +# Create map of op names to rules +our %grammar = +( + #Floating Point Instructions + FADD => [ { type => $x32T, code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FADD32I => [ { type => $x32T, code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o, } ], + FCHK => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o, } ], #Partial? 
+ FCMP => [ { type => $cmpT, code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o, } ], + FFMA => [ + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o, }, + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o, }, + ], + FMNMX => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o, } ], + FMUL => [ { type => $x32T, code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FMUL32I => [ { type => $x32T, code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o, } ], + FSET => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o, } ], + FSETP => [ { type => $cmpT, code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], + MUFU => [ { type => $qtrT, code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o, } ], + RRO => [ { type => $rroT, code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o, } ], + DADD => [ { type => $x64T, code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o, } ], + DFMA => [ { type => $x64T, code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o, } ], + DMNMX => [ { type => $cmpT, code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o, } ], + DMUL => [ { type => $x64T, code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o, } ], + DSET => [ { type => $cmpT, code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o, } ], + DSETP => [ { type => $cmpT, code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o, } ], + FSWZADD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o, } ], #TODO + + HADD2 => [ 
{ type => $x32T, code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o, } ], + HMUL2 => [ { type => $x32T, code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o, } ], + HFMA2 => [ { type => $x32T, code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o, } ], + HSETP2 => [ { type => $cmpT, code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial + + #Integer Instructions + BFE => [ { type => $shftT, code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o, } ], + BFI => [ { type => $shftT, code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o, } ], + FLO => [ { type => $s2rT, code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o, } ], + IADD => [ { type => $x32T, code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o, } ], + IADD32I => [ { type => $x32T, code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o, } ], + IADD3 => [ { type => $x32T, code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o, } ], + ICMP => [ { type => $cmpT, code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o, } ], + IMNMX => [ { type => $shftT, code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o, } ], + ISET => [ { type => $shftT, code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o, } ], + ISETP => [ { type => $cmpT, code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ], + ISCADD => [ { type => $shftT, code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o, } ], + ISCADD32I => [ { type => $shftT, code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o, } ], + LEA => [ + { type => $cmpT, code => 0x5bd0000000000000, rule => 
qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o, }, + { type => $shftT, code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o, }, + { type => $shftT, code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o, }, + { type => $shftT, code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o, }, + ], + LOP => [ { type => $x32T, code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?~)?$icr20(?\.INV)?;"o, } ], + LOP32I => [ { type => $x32T, code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o, } ], + LOP3 => [ + { type => $x32T, code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o, }, + { type => $x32T, code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o, }, + ], + POPC => [ { type => $s2rT, code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o, } ], + SHF => [ + { type => $shftT, code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o, }, + { type => $shftT, code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o, }, + ], + SHL => [ { type => $shftT, code => 0x5c48000000000000, rule => qr"^$pred?SHL(?\.W)? 
$r0, $r8, $icr20;"o, } ], + SHR => [ { type => $shftT, code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o, } ], + XMAD => [ + { type => $x32T, code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o, }, + { type => $x32T, code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o, }, + ], + # XMAD replaces these + IMAD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o, } ], #TODO + IMADSP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO + IMUL => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o, } ], #TODO + + #Conversion Instructions + F2F => [ { type => $qtrT, code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ], + F2I => [ { type => $qtrT, code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o, } ], + I2F => [ { type => $qtrT, code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o, } ], + I2I => [ { type => $qtrT, code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o, } ], + + #Movement Instructions + MOV => [ { type => $x32T, code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o, } ], + MOV32I => [ { type => $x32T, code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o, } ], + PRMT => [ { type => $x32T, code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ], + SEL => [ { type => $x32T, code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o, } ], + SHFL => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ], + + #Predicate/CC Instructions + PSET => [ { type => $cmpT, code => 0x5088000000000000, rule => 
qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o, } ], + PSETP => [ { type => $cmpT, code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ], + CSET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o, } ], #TODO + CSETP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO + P2R => [ { type => $x32T, code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o, } ], + R2P => [ { type => $cmpT, code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o, } ], + + #Texture Instructions + # Handle the commonly used 1D texture functions.. but save the others for later + TLD => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial + TLDS => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial + TEX => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o, } ], #TODO + TLD4 => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o, } ], #TODO + TXQ => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o, } ], #TODO + TEXS => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o, } ], #TODO + TLD4S => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO + + #Compute Load/Store Instructions + LD => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o, } ], + ST => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o, } ], + LDG => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o, } ], + STG => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, 
$r0;"o, } ], + LDS => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o, } ], + STS => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o, } ], + LDL => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o, } ], + STL => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o, } ], + LDC => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o, } ], + # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded). + ATOM => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + ATOMS => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + RED => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o, } ], + CCTL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o, } ], #TODO + CCTLL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO + CCTLT => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO + + #Surface Memory Instructions (haven't gotten to these yet..) + SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO + SULD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o, } ], #TODO + SURED => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o, } ], #TODO + SUST => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o, } ], #TODO + + #Control Instructions + BRA => [ + { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?\.U)? 
$i20w24;"o, }, + { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?\.U)? CC\.EQ, $i20w24;"o, }, + ], + BRX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o, } ], #TODO + JMP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o, } ], #TODO + JMX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o, } ], #TODO + SSY => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o, } ], + SYNC => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o, } ], + CAL => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o, } ], + JCAL => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o, } ], + PRET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o, } ], #TODO + RET => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o, } ], + BRK => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o, } ], + PBK => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o, } ], + CONT => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o, } ], + PCNT => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o, } ], + EXIT => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o, } ], + PEXIT => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o, } ], #TODO + BPT => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o, } ], + + #Miscellaneous Instructions + NOP => [ { type => $x32T, code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o, } ], + CS2R => [ { type => $x32T, code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o, } ], + S2R => [ { type => $s2rT, code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o, } ], + B2R => [ { type => $x32T, code => 
0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o, } ], + BAR => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o, } ], + DEPBAR => [ + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, }, + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o, }, + ], + MEMBAR => [ { type => $x32T, code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o, } ], + VOTE => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?))$p45, $p39;"o, } ], + R2B => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o, } ], #TODO + + #Video Instructions... Need to finish + VADD => [ { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMAD => [ + { type => $x32T, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, }, + ], + VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMNMX => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + + VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 +); + +# Create map of capture groups to op code flags that need to be added (or removed) +my @flags = grep /\S/, split "\n", q{; + +BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD +0x0100000000000000 neg + +FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP +0x0100000000000000 neg + +PSET, PSETP 
+0x0000000000008000 p12not +0x0000000100000000 p29not + +FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE +0x0000040000000000 p39not + +IADD, IADD3, XMAD, LEA, IMNMX +0x0000800000000000 CC + +IADD32I +0x0010000000000000 CC + +LEA +0x0000000000000000 X + +SHF +0x0004000000000000 W +0x0001000000000000 HI + +SHF: type +0x0000004000000000 U64 +0x0000006000000000 S64 + +SHR, IMNMX, ISETP, ISET, ICMP, BFE +0x0001000000000000 U32 + +SHL +0x0000008000000000 W + +SHFL +0x0000000010000000 i20w8 +0x0000000020000000 i34w13 + +SHFL: mode +0x0000000000000000 IDX +0x0000000040000000 UP +0x0000000080000000 DOWN +0x00000000c0000000 BFLY + +IMNMX: mode +0x0000080000000000 XLO +0x0000180000000000 XHI + +ISETP, ISET, ICMP: cmp +0x0002000000000000 LT +0x0004000000000000 EQ +0x0006000000000000 LE +0x0008000000000000 GT +0x000a000000000000 NE +0x000c000000000000 GE + +ISETP, ISET, PSETP, PSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +PSETP, PSET: bool2 +0x0000000000000000 AND +0x0000000001000000 OR +0x0000000002000000 XOR + +ISETP, ISET +0x0000080000000000 X + +LOP: bool +0x0000000000000000 AND +0x0000020000000000 OR +0x0000040000000000 XOR +0x0000060000000000 PASS_B + +LOP: +0x0000010000000000 INV + +LOP: z +0x0000200000000000 Z +0x0000300000000000 NZ + +LOP +0x0007000000000000 noz + +LOP32I: bool +0x0000000000000000 AND +0x0020000000000000 OR +0x0040000000000000 XOR + +PRMT: mode +0x0001000000000000 F4E +0x0002000000000000 B4E +0x0003000000000000 RC8 +0x0004000000000000 ECL +0x0005000000000000 ECR +0x0006000000000000 RC16 + +XMAD: type1 +0x0000000000000000 U16 +0x0001000000000000 S16 + +XMAD: type2 +0x0000000000000000 U16 +0x0002000000000000 S16 + +XMAD: mode +0x0000002000000000 MRG +0x0000001000000000 PSL +0x0008000000000000 CHI +0x0004000000000000 CLO +0x000c000000000000 CSFU + +XMAD: modec +0x0004000000000000 CLO +0x0008000000000000 CHI +0x000c000000000000 CSFU +0x0040000000000000 X +0x0080000000000000 PSL 
+0x0100000000000000 MRG + +XMAD +0x0010000000000000 CBCC + +XMAD: r8part +0x0000000000000000 H0 +0x0020000000000000 H1 + +XMAD: r20part +0x0000000000000000 H0 +0x0000000800000000 H1 + +XMAD: r20partx +0x0000000000000000 H0 +0x0010000000000000 H1 + +XMAD: r39part +0x0000000000000000 H0 +0x0010000000000000 H1 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part +0x0000000000000000 B0 +0x0000001000000000 B1 +0x0000002000000000 B2 +0x0000003000000000 B3 +0x0000001000000000 H1 +0x0000000000000000 H0 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part +0x0000000000000000 B0 +0x0000000010000000 B1 +0x0000000020000000 B2 +0x0000000030000000 B3 +0x0000000010000000 H1 +0x0000000000000000 H0 + +VMAD +0x0040000000000000 r8neg +0x0020000000000000 r39neg +0x0008000000000000 SHR_7 +0x0010000000000000 SHR_15 +0x0060000000000000 PO +0x0080000000000000 SAT + +VMNMX +0x0100000000000000 MX + +VADD, VABSDIFF, VMNMX +0x0080000000000000 SAT +0x0040000000000000 UD +0x0040000000000000 SD + +VSET: cmp +0x0040000000000000 LT +0x0080000000000000 EQ +0x00c0000000000000 LE +0x0100000000000000 GT +0x0140000000000000 NE +0x0180000000000000 GE + +VADD, VSET: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VABSDIFF: mode +0x0003000000000000 ACC +0x000b000000000000 MIN +0x0013000000000000 MAX +0x0023000000000000 MRG_16H +0x002b000000000000 MRG_16L +0x0033000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x003b000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMNMX: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1 +0x0000000000000000 U +0x0001000000000000 S + +VMAD, VADD, 
VABSDIFF, VMNMX, VSET: sign2 +0x0000000000000000 U +0x0002000000000000 S + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size1 +0x0000000000000000 8 +0x0000004000000000 16 +0x0000006000000000 32 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size2 +0x0000000000000000 8 +0x0000000040000000 16 +0x0000000060000000 32 + +IADD3: type +0x0001000000000000 X +0x0000002000000000 RS +0x0000004000000000 LS + +IADD3: r8part +0x0000000000000000 H0 +0x0000001000000000 H1 + +IADD3: r20part +0x0000000080000000 H0 + +IADD3: r39part +0x0000000200000000 H0 + +IADD3 +0x0008000000000000 r8neg +0x0004000000000000 r20neg +0x0002000000000000 r39neg + +IADD +0x0000080000000000 X +0x0004000000000000 SAT + +IADD, ISCADD +0x0002000000000000 r8neg +0x0001000000000000 r20neg + +IADD32I +0x0100000000000000 r8neg +0x0020000000000000 X + +DEPBAR: SB +0x0000000000000000 SB0 +0x0000000004000000 SB1 +0x0000000008000000 SB2 +0x000000000c000000 SB3 +0x0000000010000000 SB4 +0x0000000014000000 SB5 + +DEPBAR: cmp +0x0000000020000000 LE + +DEPBAR +0x0000000000000001 db0 +0x0000000000000002 db1 +0x0000000000000004 db2 +0x0000000000000008 db3 +0x0000000000000010 db4 +0x0000000000000020 db5 + +F2F, F2I, I2F, I2I: destWidth +0x0000000000000000 8 +0x0000000000000100 16 +0x0000000000000200 32 +0x0000000000000300 64 + +F2F, F2I, I2F, I2I: srcWidth +0x0000000000000000 8 +0x0000000000000400 16 +0x0000000000000800 32 +0x0000000000000c00 64 + +F2F, F2I, I2F, I2I: destSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000001000 S + +F2F, F2I, I2F, I2I: srcSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000002000 S + +F2I, I2F, I2I: r20part +0x0000000000000000 H0 +0x0000040000000000 H1 +0x0000000000000000 B0 +0x0000020000000000 B1 +0x0000040000000000 B2 +0x0000060000000000 B3 + +F2F: r20part +0x0000000000000000 H0 +0x0000020000000000 H1 + +F2F: round +0x0000040000000000 ROUND +0x0000048000000000 FLOOR +0x0000050000000000 CEIL +0x0000058000000000 TRUNC + +F2I: round +0x0000000000000000 ROUND +0x0000008000000000 FLOOR 
+0x0000010000000000 CEIL +0x0000018000000000 TRUNC + +HADD2, HMUL2: r8part +0x0001000000000000 H0_H0 +0x0000000000000000 H1_H1 + +HFMA2: r20part +0x0000000020000000 H0_H0 +0x0000000030000000 H1_H1 + +FADD, DADD, FMUL, DMUL, F2F, I2F: rnd +0x0000000000000000 RN +0x0000008000000000 RM +0x0000010000000000 RP +0x0000018000000000 RZ + +DFMA: rnd +0x0000000000000000 RN +0x0004000000000000 RM +0x0008000000000000 RP +0x000c000000000000 RZ + +FFMA: rnd +0x0000000000000000 RN +0x0008000000000000 RM +0x0010000000000000 RP +0x0018000000000000 RZ + +FFMA +0x0020000000000000 FTZ + +F2F, F2I, FADD, FMUL, FMNMX +0x0000100000000000 FTZ + +FADD32I +0x0080000000000000 FTZ + +FMUL32I +0x0020000000000000 FTZ + +FSET +0x0080000000000000 FTZ + +FSETP, FCMP +0x0000800000000000 FTZ + +HADD2, HMUL2 +0x0000008000000000 FTZ + +HFMA2 +0x0000002000000000 FTZ + +FADD, FFMA, FMUL, F2F, I2I +0x0004000000000000 SAT + +FADD, DADD, FMNMX, DMNMX, MUFU +0x0001000000000000 r8neg + +FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I +0x0000200000000000 r20neg + +FMUL, DMUL, FFMA, DFMA +0x0001000000000000 r20neg + +FFMA, DFMA +0x0002000000000000 r39neg + +FADD, DADD, FMNMX, DMNMX +0x0000400000000000 r8abs + +FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I +0x0002000000000000 r20abs + +FSETP, DSETP, FSET, DSET +0x0000080000000000 r8neg +0x0000000000000040 r20neg +0x0000000000000080 r8abs +0x0000100000000000 r20abs + +RRO: func +0x0000000000000000 SINCOS +0x0000008000000000 EX2 + +MUFU: func +0x0000000000000000 COS +0x0000000000100000 SIN +0x0000000000200000 EX2 +0x0000000000300000 LG2 +0x0000000000400000 RCP +0x0000000000500000 RSQ +0x0000000000600000 RCP64H +0x0000000000700000 RSQ64H + +FSETP, DSETP, FSET, DSET, FCMP: cmp +0x0001000000000000 .LT +0x0002000000000000 .EQ +0x0003000000000000 .LE +0x0004000000000000 .GT +0x0004000000000000 +0x0005000000000000 .NE +0x0006000000000000 .GE +0x0007000000000000 .NUM +0x0008000000000000 .NAN +0x0009000000000000 .LTU +0x000a000000000000 .EQU +0x000b000000000000 .LEU 
+0x000c000000000000 .GTU +0x000d000000000000 .NEU +0x000e000000000000 .GEU + +FSETP, DSETP, FSET, DSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +HSETP2: cmp +0x0000002800000000 .NE + +HSETP2: bool +0x0000000000000000 AND + +S2R: sr +0x0000000000000000 LANEID +0x0000000000200000 VIRTCFG +0x0000000000300000 VIRTID +0x0000000002100000 TID.X +0x0000000002200000 TID.Y +0x0000000002300000 TID.Z +0x0000000002500000 CTAID.X +0x0000000002600000 CTAID.Y +0x0000000002700000 CTAID.Z +0x0000000003800000 EQMASK +0x0000000003900000 LTMASK +0x0000000003a00000 LEMASK +0x0000000003b00000 GTMASK +0x0000000003c00000 GEMASK + +CS2R: sr +0x0000000005000000 CLOCKLO +0x0000000005100000 CLOCKHI +0x0000000005200000 GLOBALTIMERLO +0x0000000005300000 GLOBALTIMERHI + +B2R +0x0000e00000000000 nop45 + +BAR +0x0000100000000000 i8w4 +0x0000080000000000 nor20 +0x0000038000000000 nop39 + +BAR: mode +0x0000000000000000 SYNC +0x0000000100000000 ARV +0x0000000200000000 RED + +BAR: red +0x0000000000000000 POPC +0x0000000800000000 AND +0x0000001000000000 OR + +MEMBAR: mode +0x0000000000000000 CTA +0x0000000000000100 GL +0x0000000000000200 SYS + +VOTE: mode +0x0000000000000000 ALL +0x0001000000000000 ANY +0x0002000000000000 EQ + +VOTE +0x00000000000000ff nor0 + +BRA +0x0000000000000080 U + +TLDS: chnls +0x0010000000000000 RGBA + +TLDS +0x0002000000000000 NODEP + +LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS +0x000000000000ff00 nor8 + +LD, ST: type +0x0000000000000000 .U8 +0x0020000000000000 .S8 +0x0040000000000000 .U16 +0x0060000000000000 .S16 +0x0080000000000000 +0x0080000000000000 .32 +0x00a0000000000000 .64 +0x00c0000000000000 .128 + +LD, ST: cache +0x0100000000000000 CG +0x0200000000000000 CS +0x0300000000000000 CV +0x0300000000000000 WT + +LDG, STG, LDS, STS, LDL, STL, LDC: type +0x0000000000000000 .U8 +0x0001000000000000 .S8 +0x0002000000000000 .U16 +0x0003000000000000 .S16 +0x0004000000000000 +0x0004000000000000 .32 +0x0005000000000000 .64 
+0x0006000000000000 .128 + +LDG, STG: cache +0x0000400000000000 CG +0x0000800000000000 CI +0x0000800000000000 CS +0x0000c00000000000 CV +0x0000c00000000000 WT + +LDL: cache +0x0000200000000000 CI + +LDC: cache +0x0000100000000000 IL + +LDG, STG, LDS, STS, LDL, STL, LDC +0x0000200000000000 E + +LDS +0x0000100000000000 U + +RED: type +0x0000000000000000 +0x0000000000100000 .S32 +0x0000000000200000 .U64 +0x0000000000300000 .F32.FTZ.RN +0x0000000000400000 .F16x2.FTZ.RN +0x0000000000500000 .S64 + +RED: mode +0x0000000000000000 ADD +0x0000000000800000 MIN +0x0000000001000000 MAX +0x0000000001800000 INC +0x0000000002000000 DEC +0x0000000002800000 AND +0x0000000003000000 OR +0x0000000003800000 XOR + +ATOM: type +0x0000000000000000 +0x0002000000000000 .S32 +0x0004000000000000 .U64 +0x0006000000000000 .F32.FTZ.RN +0x0008000000000000 .F16x2.FTZ.RN +0x000a000000000000 .S64 +0x0002000000000000 .64 + +ATOM, RED +0x0001000000000000 E + +ATOM: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x03f0000000000000 CAS + +ATOMS: type +0x0000000000000000 +0x0000000010000000 .S32 +0x0000000020000000 .U64 +0x0000000030000000 .S64 +0x0010000000000000 .64 + +ATOMS: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x0240000000000000 CAS +}; + +# The existence of a capture group can map directly to an op code adjustment, or... 
+# The named capture group value can map the op code adjustmemt from among several options +our %flags; +my (@ops, $flag); +foreach my $line (@flags) +{ + if ($line =~ m'^(0x[0-9a-z]+)\s*(.*)') + { + my $val = hex($1); + # named rules (op: name) + if ($flag) + { $flags{$_}{$flag}{$2} = $val foreach @ops; } + # simple existence check rules + else + { $flags{$_}{$2} = $val foreach @ops; } + } + else + { + my ($ops, $name) = split ':\s*', $line; + @ops = split ',\s*', $ops; + $flag = $name; + } +} + +sub parseInstruct +{ + my ($inst, $grammar) = @_; + return unless $inst =~ $grammar->{rule}; + my %capData = %+; + return \%capData; +} + +# for immediate or constant operands and a given opcode, bits 56-63 get transformed +my %immedOps = map { $_ => 1 } qw(i20 f20 d20); +my %immedCodes = +( + 0x5c => 0x64, + 0x5b => 0x6d, + 0x59 => 0x6b, + 0x58 => 0x68, +); +my %constCodes = +( + c20 => 0x10, + c39 => 0x08, +); +my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4); + +# just pick out the reuse code and nothing else +sub genReuseCode +{ + my $capData = shift; + my $reuse = 0; + $reuse |= $reuseCodes{$_} foreach grep $capData->{$_}, keys %reuseCodes; + return $reuse; +} + +# Generate an op code from regex capture data +# if you pass in a test array ref it will populate it with the matching capture groups +sub genCode +{ + my ($op, $grammar, $capData, $test) = @_; + + my $flags = $flags{$op}; + my $code = $grammar->{code}; + my $reuse = 0; + my $immedCode = $immedCodes{$code >> 56}; + + #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I'; + + # process the instruction predicate (if valid for this instuction) + if (exists $capData->{noPred}) + { + delete $capData->{noPred}; + push @$test, 'noPred' if $test; + } + else + { + my $p = defined($capData->{predNum}) ? 
$capData->{predNum} : 7; + push @$test, 'predNum' if $test; + if (exists $capData->{predNot}) + { + $p |= 8; + push @$test, 'predNot' if $test; + } + $code ^= $p << 16; + delete @{$capData}{qw(predNum predNot)}; + + } + # process the register reuse flags + foreach my $rcode (qw(reuse1 reuse2 reuse3)) + { + if (delete $capData->{$rcode}) + { + $reuse |= $reuseCodes{$rcode}; + push @$test, $rcode if $test; + } + } + + foreach my $capture (keys %$capData) + { + # change the base code for immediate versions of the op + if (exists $immedOps{$capture}) + { $code ^= $immedCode << 56; } + # change the base code for constant versions of the op + elsif (exists $constCodes{$capture}) + { $code ^= $constCodes{$capture} << 56; } + + # if capture group is an operand then process and add that data to code + if (exists $operands{$capture}) + { + # don't process the r20 that comes with the r39s20 capture + unless ($capture eq 'r20' && exists $capData->{r39s20}) + { + $code ^= $operands{$capture}->($capData->{$capture}); + push @$test, $capture if $test; + } + } + + # Add matching flags (an operand might also add/remove a flag) + if (exists $flags->{$capture}) + { + # a named multivalue flag + if (ref $flags->{$capture}) + { + $code ^= $flags->{$capture}{$capData->{$capture}}; + push @$test, "$capture:$capData->{$capture}" if $test; + } + # a simple exists flag + else + { + $code ^= $flags->{$capture}; + push @$test, $capture if $test; + } + } + elsif (!exists $operands{$capture} && !$test) + { + # Every capture group should be acted upon. Missing one is a bug. 
+ warn "UNUSED: $op: $capture: $capData->{$capture}\n"; + warn Dumper($flags); + } + } + + return $code, $reuse; +} + + +my $CtrlRe = qr'(?[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])'; +my $PredRe = qr'(?@!?(?P\d)\s+)'; +my $InstRe = qr"$PredRe?(?\w+)(?[^;]*;)"o; +my $CommRe = qr'(?.*)'; + +sub processAsmLine +{ + my ($line, $lineNum) = @_; + + if ($line =~ m"^$CtrlRe(?\s+)$InstRe$CommRe"o) + { + return { + lineNum => $lineNum, + pred => $+{pred}, + predReg => $+{predReg}, + space => $+{space}, + op => $+{op}, + comment => $+{comment}, + inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}), + ctrl => readCtrl($+{ctrl}, $line), + }; + } + return undef; +} + +sub processSassLine +{ + my $line = shift; + + if ($line =~ m"^\s+/\*(?[0-9a-f]+)\*/\s+$InstRe\s+/\* (?0x[0-9a-f]+)"o) + { + return { + num => hex($+{num}), + pred => $+{pred}, + op => $+{op}, + ins => normalizeSpacing($+{op} . $+{rest}), + inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}), + code => hex($+{code}), + }; + } + return undef; +} + +sub processSassCtrlLine +{ + my ($line, $ctrl, $ruse) = @_; + + return 0 unless $line =~ m'^\s+\/\* (0x[0-9a-f]+)'; + + my $code = hex($1); + if (ref $ctrl) + { + push @$ctrl, ($code & 0x000000000001ffff) >> 0; + push @$ctrl, ($code & 0x0000003fffe00000) >> 21; + push @$ctrl, ($code & 0x07fffc0000000000) >> 42; + } + if (ref $ruse) + { + push @$ruse, ($code & 0x00000000001e0000) >> 17; + push @$ruse, ($code & 0x000003c000000000) >> 38; + push @$ruse, ($code & 0x7800000000000000) >> 59; + } + return 1; +} + +sub replaceXMADs +{ + my $file = shift; + +# XMAD.LO d, a, b, c, x; +# ---------------------- +# XMAD.MRG x, a, b.H1, RZ; +# XMAD d, a, b, c; +# XMAD.PSL.CBCC d, a.H1, x.H1, d; +# ---------------------- +# XMAD d, a, 0xffff, c; +# XMAD.PSL d, a.H1, 0xffff, d; + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD\.LO\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*,\s*(?\w+)\s*;$CommRe/ + + die "XMAD.LO: Destination and first operand cannot 
be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s +%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s; +%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;', + @+{qw(ctrl space pred d a b c x comment)} + /egmos; + + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD(?(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?-?$immed|\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/ + + die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s +%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;', + @+{qw(ctrl space pred d a b c comment mod)} + /egmos; + + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD(?(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*,\s*(?\w+)\s*;$CommRe/ + + die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s +%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;', + @+{qw(ctrl space pred d a b c comment mod)} + /egmos; + + #TODO: add more XMAD macros + return $file; +} +# convert extra spaces to single spacing to make our re's simplier +sub normalizeSpacing +{ + my $inst = shift; + $inst =~ s/\t/ /g; + $inst =~ s/\s{2,}/ /g; + return $inst; +} + + +# map binary control notation on to easier to work with format. +sub printCtrl +{ + my $code = shift; + + my $stall = ($code & 0x0000f) >> 0; + my $yield = ($code & 0x00010) >> 4; + my $wrtdb = ($code & 0x000e0) >> 5; # write dependency barier + my $readb = ($code & 0x00700) >> 8; # read dependency barier + my $watdb = ($code & 0x1f800) >> 11; # wait on dependency barier + + $yield = $yield ? '-' : 'Y'; + $wrtdb = $wrtdb == 7 ? '-' : $wrtdb + 1; + $readb = $readb == 7 ? '-' : $readb + 1; + $watdb = $watdb ? 
sprintf('%02x', $watdb) : '--'; + + return sprintf '%s:%s:%s:%s:%x', $watdb, $readb, $wrtdb, $yield, $stall; +} +sub readCtrl +{ + my ($ctrl, $context) = @_; + my ($watdb, $readb, $wrtdb, $yield, $stall) = split ':', $ctrl; + + $watdb = $watdb eq '--' ? 0 : hex $watdb; + $readb = $readb eq '-' ? 7 : $readb - 1; + $wrtdb = $wrtdb eq '-' ? 7 : $wrtdb - 1; + $yield = $yield eq 'y' || $yield eq 'Y' ? 0 : 1; + $stall = hex $stall; + + die sprintf('wait dep out of range(0x00-0x3f): %x at %s', $watdb, $context) if $watdb != ($watdb & 0x3f); + + return + $watdb << 11 | + $readb << 8 | + $wrtdb << 5 | + $yield << 4 | + $stall << 0; +} + +sub getRegNum +{ + my ($regMap, $regName) = @_; + + return !exists($regMap->{$regName}) || ref($regMap->{$regName}) ? $regName : $regMap->{$regName}; +} + +sub getVecRegisters +{ + my ($vectors, $capData) = @_; + my $regName = $capData->{r0} or return; + + return if $regName eq 'RZ'; + + if ($capData->{type} eq '.64' || $capData->{i31w4} eq '0x3') + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. $1+1); + } + confess "$regName not a 64bit vector register" unless exists $vectors->{$regName}; + return @{$vectors->{$regName}}[0,1]; + } + if ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. $1+3); + } + confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4; + return @{$vectors->{$regName}}; + } + return $regName; +} + +sub getAddrVecRegisters +{ + my ($vectors, $capData) = @_; + my $regName = $capData->{r8} or return; + + return if $regName eq 'RZ'; + + if (exists $capData->{E}) + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. 
$1+1); + } + print Dumper($vectors) unless exists $vectors->{$regName}; + confess "$regName not a 64bit vector register" unless exists $vectors->{$regName}; + return @{$vectors->{$regName}}[0,1]; + } + return $regName; +} + +__END__ + + + diff --git a/Assembler/PascalAs/blib/lib/PascalAs/.exists b/Assembler/PascalAs/blib/lib/PascalAs/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/PascalAs/blib/lib/PascalAs/Cubin.pm b/Assembler/PascalAs/blib/lib/PascalAs/Cubin.pm new file mode 100644 index 0000000..10bf9a8 --- /dev/null +++ b/Assembler/PascalAs/blib/lib/PascalAs/Cubin.pm @@ -0,0 +1,686 @@ +package PascalAs::Cubin; + +use strict; +use Data::Dumper; + +my @Elf32_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + L entry + L phOffset + L shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf64_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + Q entry + Q phOffset + Q shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf32_PrgHdr = qw( + L type + L offset + L vaddr + L paddr + L fileSize + L memSize + L flags + L align +); +my @Elf64_PrgHdr = qw( + L type + L flags + Q offset + Q vaddr + Q paddr + Q fileSize + Q memSize + Q align +); +my @Elf32_SecHdr = qw( + L name + L type + L flags + L addr + L offset + L size + L link + L info + L align + L entSize +); +my @Elf64_SecHdr = qw( + L name + L type + Q flags + Q addr + Q offset + Q size + L link + L info + Q align + Q entSize +); +my @Elf32_SymEnt = qw( + L name + L value + L size + C info + C other + S shIndx +); +my @Elf64_SymEnt = qw( + L name + C info + C other + S shIndx + Q value + Q size +); +my @symBind = qw(LOCAL GLOBAL WEAK); + +# Split the Elf Header defs into template strings (T) and corresponding hash keys columns (C) +my (@elfHdrT, @prgHdrT, @secHdrT, 
@symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC); + +$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr; +$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr; +$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr; +$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt; + +$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr; +$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr; +$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr; +$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt; + +$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr ]; +$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ]; +$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ]; +$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ]; + +$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr ]; +$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ]; +$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ]; +$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ]; + +# Load a cubin ELF file +sub new +{ + my ($package, $file) = @_; + + my $cubin = bless { fileName => $file }, $package; + + open my $fh, $file or die "$file: $!"; + binmode($fh); + + # Read in assuming 32 bit header + my $data; + read $fh, $data, 0x34; + my $elfHdr = $cubin->{elfHdr} = {}; + @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data; + + # 1: 32bit, 2: 64bit + my $class = $elfHdr->{fileClass}; + + # re-read in with 64 bit header if needed + if ($class == 2) + { + seek $fh, 0, 0; + read $fh, $data, 0x46; + @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data; + + $cubin->{Class} = 64; + } + else + { + $cubin->{Class} = 32; + } + + # verify sm_60 cubin + #$cubin->{Arch} = $elfHdr->{flags} & 0xFF; + #die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50; + + $cubin->{Arch} = "60"; + die "Cubin not in sm_60. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} != 60; + $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 
64 : 32; + + # Read in Program Headers + seek $fh, $elfHdr->{phOffset}, 0; + foreach (1 .. $elfHdr->{phNum}) + { + read $fh, $data, $elfHdr->{phEntSize}; + + my %prgHdr = (Indx => $_ - 1); + @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data; + push @{$cubin->{prgHdrs}}, \%prgHdr; + } + + # Read in Section Headers + seek $fh, $elfHdr->{shOffset}, 0; + foreach (1 .. $elfHdr->{shNum}) + { + read $fh, $data, $elfHdr->{shEntSize}; + + my %secHdr = (Indx => $_ - 1); + @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data; + push @{$cubin->{secHdrs}}, \%secHdr; + } + + # Read in Section data + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $data = ''; + # Skip sections with no data (type NULL or NOBITS) + if ($secHdr->{size} && $secHdr->{type} != 8) + { + seek $fh, $secHdr->{offset}, 0; + read $fh, $data, $secHdr->{size}; + } + # Convert string tables to maps + if ($secHdr->{type} == 3) # STRTAB + { + my $strTab = $secHdr->{StrTab} = {}; + my $indx = 0; + foreach my $str (split "\0", $data) + { + $strTab->{$indx} = $str; + $indx += 1 + length($str); + } + } + # Read in Symbol data + if ($secHdr->{type} == 2) # SYMTAB + { + my $offset = 0; + while ($offset < $secHdr->{size}) + { + my $symEnt = {}; + @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize}); + $offset += $secHdr->{entSize}; + + push @{$secHdr->{SymTab}}, $symEnt; + } + } + # Cache raw data for further processing and writing + $secHdr->{Data} = unpack 'H*', $data; + } + close $fh; + + # Update section headers with their names. Map names directly to headers. 
+ my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab}; + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $secHdr->{Name} = $shStrTab->{$secHdr->{name}}; + $cubin->{$secHdr->{Name}} = $secHdr; + } + + # Update symbols with their names + # For the Global functions, extract kernel meta data + # Populate the kernel hash + my $strTab = $cubin->{'.strtab'}{StrTab}; + foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}}) + { + $symEnt->{Name} = $strTab->{$symEnt->{name}}; + + # Attach symbol to section + my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}]; + $secHdr->{SymbolEnt} = $symEnt; + + # Look for symbols tagged FUNC + if (($symEnt->{info} & 0x0f) == 0x02) + { + # Create a hash of kernels for output + my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr; + + # Extract local/global/weak binding info + $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4]; + + # Extract the kernel instructions + $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ]; + + # Extract the max barrier resource identifier used and add 1. Should be 0-16. + # If a register is used as a barrier resource id, then this value is the max of 16. + $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20; + + # Extract the number of allocated registers for this kernel. + $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24; + + # Extract the size of shared memory this kernel uses. + my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"}; + $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0; + + # Attach constant0 section + $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"}; + + # Extract the kernel parameter data. 
+ my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"}; + if ($paramSec) + { + # Extract raw param data + my @data = unpack "L*", pack "H*", $paramSec->{Data}; + + $paramSec->{ParamData} = \@data; + $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ]; + + # Find the first param delimiter + my $idx = 0; + $idx++ while $idx < @data && $data[$idx] != 0x00080a04; + + my $first = $data[$idx+2] & 0xFFFF; + #my $size = $data[$idx+2] >> 16; + $idx += 4; + + my @params; + while ($idx < @data && $data[$idx] == 0x000c1704) + { + # Get the ordinal, offset, size and pointer alignment for each param + my $ord = $data[$idx+2] & 0xFFFF; + my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16); + my $psize = $data[$idx+3] >> 18; + my $align = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0; + unshift @params, "$ord:$offset:$psize:$align"; + $idx += 4; + } + my @staticParams = @data[0 .. ($idx-1)]; + + my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize); + while ($idx < @data) + { + my $code = $data[$idx] & 0xffff; + my $size = $data[$idx] >> 16; + $idx++; + + # EIATTR_MAXREG_COUNT + if ($code == 0x1b03) + { + $maxregCount = $size; + } + # EIATTR_S2RCTAID_INSTR_OFFSETS + elsif ($code == 0x1d04) + { + while ($size > 0) + { + push @ctaidOffsets, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_EXIT_INSTR_OFFSETS + elsif ($code == 0x1c04) + { + while ($size > 0) + { + push @exitOffsets, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_CTAIDZ_USED + elsif ($code == 0x0401) + { + $ctaidzUsed = 1; + } + # EIATTR_REQNTID + elsif ($code == 0x1004) + { + while ($size > 0) + { + push @reqntid, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_MAX_THREADS + elsif ($code == 0x0504) + { + while ($size > 0) + { + push @maxntid, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_CRS_STACK_SIZE + elsif ($code == 0x1e04) + { + while ($size > 0) + { + push @stackSize, $data[$idx++]; + $size -= 4; + } + } + else + { + 
# Swap new instruction data into a kernel section and regenerate the kernel's
# parameter-info section so that it matches.
#
# Named arguments (%params):
#   Kernel       - kernel section hash to modify (its ParamSec is rewritten too)
#   RegCnt       - new register count; dies when above 255
#   BarCnt       - new barrier count; dies when above 16
#   ExitOffsets  - array ref of exit instruction offsets
#   CTAIDOffsets - array ref of CTAID instruction offsets
#   CTAIDZUsed   - true when CTAID.Z is used
#   KernelData   - array ref of 64-bit words (8 bytes each); the total size
#                  must be a multiple of 64 bytes
#
# Every detected change is reported on STDOUT. $cubin->updateSize is only
# invoked when the byte size of a section actually changes.
sub modifyKernel
{
    my ($cubin, %params) = @_;

    my ($kernelSec, $regs, $bars, $exits, $ctaids, $usesCtaidZ, $words) =
        @params{qw(Kernel RegCnt BarCnt ExitOffsets CTAIDOffsets CTAIDZUsed KernelData)};
    my $kernelBytes = 8 * scalar(@$words);

    die "255 register max" if $regs > 255;
    die "new kernel size must be multiple of 8 instructions (64 bytes)" if $kernelBytes & 63;
    die "16 is max barrier count" if $bars > 16;

    my $paramSec   = $kernelSec->{ParamSec};
    my $kernelName = $kernelSec->{SymbolEnt}{Name};
    my $maxRegs    = $paramSec->{MAXREG_COUNT};
    my $stack      = $paramSec->{STACKSIZE};

    # Install the new instruction words, both as a list and as a hex string.
    $kernelSec->{KernelData} = $words;
    $kernelSec->{Data}       = unpack "H*", pack "Q*", @$words;

    # The register count lives in the top byte of the section's info field.
    unless ($regs == $kernelSec->{RegCnt})
    {
        print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $regs\n";
        $kernelSec->{RegCnt} = $regs;
        $kernelSec->{info}   = ($kernelSec->{info} & ~0xff000000) | ($regs << 24);
    }
    # The barrier count lives in bits 20-24 of the section's flags field.
    unless ($bars == $kernelSec->{BarCnt})
    {
        print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $bars\n";
        $kernelSec->{BarCnt} = $bars;
        $kernelSec->{flags}  = ($kernelSec->{flags} & ~0x01f00000) | ($bars << 20);
    }

    # Rebuild the attribute stream: the untouched static prefix first, then
    # each attribute record as (size << 16 | code) followed by its values.
    # Shifting the element count by 18 stores the size in bytes.
    my @attrs = @{$paramSec->{StaticParams}};

    push @attrs, ($maxRegs << 16) | 0x1b03 if defined $maxRegs;

    my $ctaidsNow  = join ',', map { sprintf '%04x', $_ } @$ctaids;
    my $ctaidsWere = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}};
    print "Modified $kernelName CTAID Offsets: '$ctaidsWere' => '$ctaidsNow'\n"
        if $ctaidsNow ne $ctaidsWere;
    push @attrs, (scalar(@$ctaids) << 18) | 0x1d04, @$ctaids
        if @$ctaids;

    my $exitsNow  = join ',', map { sprintf '%04x', $_ } @$exits;
    my $exitsWere = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}};
    print "Modified $kernelName Exit Offsets: '$exitsWere' => '$exitsNow'\n"
        if $exitsNow ne $exitsWere;
    push @attrs, (scalar(@$exits) << 18) | 0x1c04, @$exits
        if @$exits;

    print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$usesCtaidZ'\n"
        if $usesCtaidZ != $paramSec->{CTAIDZUsed};
    push @attrs, 0x0401 if $usesCtaidZ;

    # These two records are carried over unchanged from the parsed section.
    foreach my $carried ([0x1004, 'REQNTID'], [0x0504, 'MAXNTID'])
    {
        my ($code, $key) = @$carried;
        if (@{$paramSec->{$key}})
        {
            push @attrs, (scalar(@{$paramSec->{$key}}) << 18) | $code, @{$paramSec->{$key}};
        }
    }

    push @attrs, (scalar(@$stack) << 18) | 0x1e04, @$stack
        if @$stack;

    my $paramBytes = 4 * scalar(@attrs);
    $paramSec->{Data} = unpack "H*", pack "L*", @attrs;
    unless ($paramBytes == $paramSec->{size})
    {
        print "Modified $kernelName ParamSecSize: $paramSec->{size} => $paramBytes\n";
        $cubin->updateSize($paramSec, $paramBytes);
    }

    unless ($kernelBytes == $kernelSec->{size})
    {
        print "Modified $kernelName KernelSize: $kernelSec->{size} => $kernelBytes\n";
        $cubin->updateSize($kernelSec, $kernelBytes, 1);
    }
}
0 : $secHdr->{size}; + + # Add any needed padding between sections + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pos += $secHdr->{align} - $pad; + } + # map old offset to new + $sizeMap{$secHdr->{offset}} = $pos; + + # update offset + $secHdr->{offset} = $pos; + + # advance position by size + $pos += $size; + } + + # compute total section header size + my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset}; + + # map old offset to new + $sizeMap{$elfHdr->{shOffset}} = $pos; + $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize; + + $elfHdr->{shOffset} = $pos; + $elfHdr->{phOffset} = $pos + $shSize; + + # update program header offsets and sizes + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + # Not sure how best to adjust these so just assume they'll track other offsets. + $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}}; + + # If the kernel sizes changes, also update the associated ProgramHeader. + # Note that this size is the kernel size plus any constant section sizes. + if ($updatePrgSize && $prgHdr->{type} == 1 && + $sec->{offset} >= $prgHdr->{offset} && + $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta) + { + $prgHdr->{fileSize} += $delta; + $prgHdr->{memSize} += $delta; + } + } +} + +# Write out the cubin after modifying it. 
sub write
{
    # Serialise the (possibly modified) cubin to $file.
    #
    # Layout written, in order: ELF header, section data (each padded to the
    # section's alignment), section headers, program headers.
    #
    # Parameters:
    #   $cubin - cubin hash holding elfHdr, secHdrs and prgHdrs
    #   $file  - path of the file to create or truncate
    #
    # Dies if the file cannot be opened or if the final flush on close fails.
    my ($cubin, $file) = @_;

    # Three-argument open keeps the file name from being parsed as part of the mode.
    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
    binmode($fh);

    my $elfHdr = $cubin->{elfHdr};
    my $class  = $elfHdr->{fileClass};

    # write elf header
    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
    my $pos = $elfHdr->{ehSize};

    # write section data
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        # Skip NULL and NOBITS data sections
        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;

        # Add any needed padding between sections
        my $pad = $pos % $secHdr->{align};
        if ($pad > 0)
        {
            $pad = $secHdr->{align} - $pad;
            print $fh "\0" x $pad;
            $pos += $pad;
        }

        print $fh pack 'H*', $secHdr->{Data};
        $pos += $secHdr->{size};
    }

    # write section headers
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
    }

    # write program headers
    foreach my $prgHdr (@{$cubin->{prgHdrs}})
    {
        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
    }

    # Buffered write errors (e.g. a full disk) only surface here.
    close $fh or die "Error: could not finish writing $file: $!";
}
# Preprocess and Assemble a source file
#
# Parameters:
#   $file    - source text; handed to Preprocess before assembly
#   $include - include path information, passed through to Preprocess
#   $doReuse - true to compute register reuse flags here; false to take the
#              reuse flags written in the source (via genReuseCode)
#   $nowarn  - true to suppress "register used without initialization" warnings
#
# Returns a hash ref with RegCnt, BarCnt, ExitOffsets, CTAIDOffsets,
# CTAIDZUsed, ConflictCnt, ReuseCnt, ReuseTot, ReusePct and KernelData
# (the list of control and op codes).
#
# Dies on a badly formed line, an unknown label, an instruction that no
# grammar rule matches, or a store op carrying a read-after-write dependency.
sub Assemble
{
    my ($file, $include, $doReuse, $nowarn) = @_;

    my $regMap = {};
    $file = Preprocess($file, $include, 0, $regMap);
    my $vectors = delete $regMap->{__vectors};
    my $regBank = delete $regMap->{__regbank};

    # initialize cubin counts
    my $regCnt = 0;
    my $barCnt = 0;

    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);

    # initialize the first control instruction
    push @instructs, $ctrl = {};

    foreach my $line (split "\n", $file)
    {
        # keep track of line nums in the physical file
        $lineNum++;

        next unless preProcessLine($line);

        # match an instruction
        if (my $inst = processAsmLine($line, $lineNum))
        {
            # Save us from crashing the display driver
            die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n"
                if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0;

            # track branches/jumps/calls/etc for label remapping
            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};

            # push the control code onto the control instruction
            push @{$ctrl->{ctrl}}, $inst->{ctrl};

            # now point the instruction to its associated control instruction
            $inst->{ctrl} = $ctrl;

            # add the op name and full instruction text
            push @instructs, $inst;

            # add a 4th control instruction for every 3 instructions
            push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
        }
        # match a label
        elsif ($line =~ m'^([a-zA-Z]\w*):')
        {
            # map the label name to the index of the instruction about to be inserted
            $labels{$1} = @instructs+0;
        }
        else
        {
            die "badly formed line at $lineNum: $line\n";
        }
    }
    # add the final BRA op and align the number of instructions to a multiple of 8
    push @{$ctrl->{ctrl}}, 0x007ff;
    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
    while (@instructs & 7)
    {
        push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
        push @{$ctrl->{ctrl}}, 0x007e0;
        push @instructs, { op => 'NOP', inst => 'NOP;' };
    }

    # remap labels
    foreach my $i (@branches)
    {
        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
            { die "instruction has invalid label: $instructs[$i]{inst}"; }

        $instructs[$i]{jump} = $labels{$1};

        if (exists $relOffset{$instructs[$i]{op}})
            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
        else
            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
    }

    # calculate optimal register reuse
    # This effects register bank decisions so do it before analyzing register use
    foreach my $i (0 .. $#instructs)
    {
        #skip control instructions
        next unless $i & 3;

        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};

        my $match = 0;
        foreach my $gram (@{$grammar{$op}})
        {
            # Apply the rule pattern
            my $capData = parseInstruct($inst, $gram) or next;

            if ($doReuse)
            {
                # get any vector registers for r0
                my @r0 = getVecRegisters($vectors, $capData);

                # There are 2 reuse slots per register slot
                # The reuse hash points to most recent instruction index where register was last used in this slot

                # For writes to a register, clear any reuse opportunity
                if (@r0 && !exists $noDest{$op})
                {
                    foreach my $slot (keys %reuseSlots)
                    {
                        if (my $reuse = $reuse{$slot})
                        {
                            # if writing with a vector op, clear all linked registers
                            delete $reuse->{$_} foreach @r0;
                        }
                    }
                }
                # clear cache if jumping elsewhere
                %reuse = () if exists $jumpOp{$op};

                # only track register reuse for instruction types this works with
                if ($gram->{type}{reuse})
                {
                    foreach my $slot (keys %reuseSlots)
                    {
                        next unless exists $capData->{$slot};

                        my $r = $capData->{$slot};
                        next if $r eq 'RZ';
                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction

                        my $reuse = $reuse{$slot} ||= {};

                        # if this register was previously marked for potential reuse
                        if (my $p = $reuse->{$r})
                        {
                            # flag the previous instruction's ctrl reuse array slot
                            $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot};
                        }
                        # list full, delete the oldest
                        elsif (keys %$reuse > 2)
                        {
                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
                            delete $reuse->{$oldest};
                        }
                        # mark the new instruction for potential reuse
                        $reuse->{$r} = $i;
                    }
                }
            }
            # if reuse is disabled then pull value from code.
            elsif ($gram->{type}{reuse})
            {
                $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData);
            }
            $match = 1;
            last;
        }
        unless ($match)
        {
            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
            die "Unable to encode instruction: $inst\n";
        }
    }

    # Assign registers to requested banks if possible
    foreach my $r (sort keys %$regBank)
    {
        my $bank  = $regBank->{$r};
        my $avail = $regMap->{$r};
        foreach my $pos (0 .. $#$avail)
        {
            if ($bank == ($avail->[$pos] & 3))
            {
                # assign it, while removing the assigned register from the pool
                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
                last;
            }
        }
    }

    # calculate register live times and preferred banks for non-fixed registers.
    # LiveTime only half implemented...
    my (%liveTime, %pairedBanks, %reuseHistory);
    foreach my $i (0 .. $#instructs)
    {
        #skip control instructions
        next unless $i & 3;

        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};

        my $match = 0;
        foreach my $gram (@{$grammar{$op}})
        {
            # Apply the rule pattern
            my $capData   = parseInstruct($inst, $gram) or next;
            my $reuseType = $gram->{type}{reuse};

            # liveTimes and bank conflicts with source operands
            my (%addReuse, %delReuse);
            foreach my $slot (qw(r8 r20 r39))
            {
                my $r = $capData->{$slot} or next;
                next if $r eq 'RZ';

                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};

                # All registers should be written prior to being read..
                if (my $liveTime = $liveTime{$liveR})
                {
                    # for each read set the current instruction index as the high value
                    $liveTime->[$#$liveTime][1] = $i;
                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
                }
                else
                {
                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
                    push @{$liveTime{$liveR}}, [$i,$i];
                }

                # Is this register active in the reuse cache?
                my $slotHist  = $reuseHistory{$slot} ||= {};
                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;

                # If this is an auto reg, look at the open banks.
                # No need to look at banks if this register is in the reuse cache.
                if (!$selfReuse && ref $regMap->{$r})
                {
                    # Look at other source operands in this instruction and flag what banks are being used
                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
                    {
                        my $r2 = $capData->{$slot2};
                        next if $r2 eq 'RZ' || $r2 eq $r;

                        my $slotHist2 = $reuseHistory{$slot2} ||= {};

                        # Dont be concerned with non-reuse type instructions or
                        # If this operand is in the reuse cache, we don't care what bank it's on.
                        if (!$reuseType || !exists $slotHist2->{$r2})
                        {
                            # if the operand is also an auto-allocated register then link them
                            # Once we choose the bank for one we want to update that choice for the other register.
                            if (ref $regMap->{$r2})
                            {
                                push @{$pairedBanks{$r}{pairs}}, $r2;
                                $pairedBanks{$r}{banks} ||= [];
                            }
                            # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid.
                            else
                            {
                                my $bank = substr($regMap->{$r2},1) & 3;

                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
                                $pairedBanks{$r}{pairs} ||= [];
                            }
                            # Update the total use count for this register.
                            # This will be the number of times the register is pulled out of the bank.
                            $pairedBanks{$r}{useCnt}++;
                        }
                    }
                }
                # update the reuse history so we know which bank conflicts we can ignore.
                if ($reuseType)
                {
                    # flag these slots for addition or removal from reuseHistory
                    if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot})
                        { $addReuse{$slot} = $r; }
                    else
                        { $delReuse{$slot} = $r; }
                }
            }
            # update reuse history after we're done with the instruction (when the flag is actually in effect).
            # we don't want to update it in the middle since that can interfere with the checks.
            $reuseHistory{$_}{$addReuse{$_}} = 1 foreach keys %addReuse;
            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;

            # liveTimes for destination operands and vector registers
            foreach my $r0 (getVecRegisters($vectors, $capData))
            {
                # fixed register mappings can have aliases so use the actual register value for those.
                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};

                # If not writing treat just like a read
                if (exists $noDest{$op})
                {
                    if (my $liveTime = $liveTime{$liveR})
                    {
                        $liveTime->[$#$liveTime][1] = $i;
                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
                    }
                    else
                    {
                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
                        push @{$liveTime{$liveR}}, [$i,$i];
                    }
                }
                # If writing, push a new bracket on this register's stack.
                elsif (my $liveTime = $liveTime{$liveR})
                {
                    if ($i > $liveTime->[$#$liveTime][1])
                    {
                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
                    }
                }
                else
                {
                    # Initialize the liveTime stack for this register.
                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
                }
            }

            $match = 1;
            last;
        }
        unless ($match)
        {
            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
            die "Unable to encode instruction: $inst\n";
        }
    }

    # assign unassigned registers
    # sort by most restricted, then most used, then name
    foreach my $r (sort {
                        $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
                        $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
                        $a cmp $b
                    } keys %pairedBanks)
    {
        my $banks = $pairedBanks{$r}{banks};
        my $avail = $regMap->{$r};

        # Pick a bank with zero or the smallest number of conflicts
        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3))
        {
            # pick an available register that matches the requested bank
            foreach my $pos (0 .. $#$avail)
            {
                if ($bank == ($avail->[$pos] & 3))
                {
                    # assign it, while removing the assigned register from the pool
                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;

                    # update bank info for any unassigned pair
                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
                    last BANK;
                }
            }
        }
    }
    # Now assign any remaining to first available
    foreach my $r (sort keys %$regMap)
    {
        if (ref($regMap->{$r}) eq 'ARRAY')
        {
            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
        }
    }

    # apply the register mapping and assemble the instructions to op codes
    foreach my $i (0 .. $#instructs)
    {
        #skip control instructions
        next unless $i & 3;

        # save the original and replace the register names with numbers
        # NOTE(review): the pattern below was truncated in the reviewed copy and
        # has been restored as "whole word, not preceded by '.', not followed
        # by '['" - confirm against the pristine file.
        $instructs[$i]{orig} = $instructs[$i]{inst};
        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;

        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};

        my $match = 0;
        foreach my $gram (@{$grammar{$op}})
        {
            # Apply the rule pattern
            my $capData = parseInstruct($inst, $gram) or next;

            # update the register count
            foreach my $r (qw(r0 r8 r20 r39))
            {
                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';

                # get numeric portion of regname
                my $val = substr $capData->{$r}, 1;

                # smart enough to count vector registers for memory instructions.
                # A vector destination (r0) or vector address (r8) occupies
                # several consecutive registers; every other slot occupies one.
                # Both cases must feed a single $regInc: declaring it twice made
                # the r8 case overwrite the r0 case and under-count registers.
                my $regInc = 1;
                if ($r eq 'r0')
                {
                    my @vec = getVecRegisters($vectors, $capData);
                    $regInc = scalar(@vec) || 1;
                }
                elsif ($r eq 'r8')
                {
                    my @vec = getAddrVecRegisters($vectors, $capData);
                    $regInc = scalar(@vec) || 1;
                }

                if ($val + $regInc > $regCnt)
                {
                    $regCnt = $val + $regInc;
                }
            }
            # update the barrier resource count
            if ($op eq 'BAR')
            {
                if (exists $capData->{i8w4})
                {
                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
                }
                # if a barrier value is a register, assume the maximum
                elsif (exists $capData->{r8})
                {
                    $barCnt = 16;
                }
            }
            # Generate the op code.
            my ($code, $reuse) = genCode($op, $gram, $capData);
            $instructs[$i]{code} = $code;

            # cache this for final pass when we want to calculate reuse stats.
            if ($gram->{type}{reuse})
                { $instructs[$i]{caps} = $capData; }
            # use the parsed value of reuse for non-reuse type instructions
            else
                { $ctrl->{reuse}[($i & 3) - 1] = $reuse; }

            $match = 1;
            last;
        }
        unless ($match)
        {
            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
            die "Unable to encode instruction: $inst\n";
        }
    }

    # final pass to piece together control codes
    # (%reuseHistory is deliberately redeclared: the stats pass starts from an empty history)
    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
    foreach my $i (0 .. $#instructs)
    {
        # op code
        if ($i & 3)
        {
            push @codes, $instructs[$i]{code};

            if ($instructs[$i]{caps})
            {
                # calculate stats on registers
                registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
            }
            if ($instructs[$i]{inst} =~ m'EXIT')
            {
                push @exitOffsets, (scalar(@codes)-1)*8;
            }
            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
            {
                push @ctaidOffsets, (scalar(@codes)-1)*8;
                $ctaidzUsed = 1 if $1 eq 'Z';
            }
        }
        # control code
        else
        {
            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
            push @codes,
                ($ctrl->[0] <<  0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes
                ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59);  # reuse codes
        }
    }

    # return the kernel data
    return {
        RegCnt       => $regCnt,
        BarCnt       => $barCnt,
        ExitOffsets  => \@exitOffsets,
        CTAIDOffsets => \@ctaidOffsets,
        CTAIDZUsed   => $ctaidzUsed,
        ConflictCnt  => $reuseHistory{conflicts},
        ReuseCnt     => $reuseHistory{reuse},
        ReuseTot     => $reuseHistory{total},
        ReusePct     => ($reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0),
        KernelData   => \@codes,
    };
}
$inst->{inst} : '') if $gram->{type}{reuse}; + + $inst->{caps} = join ', ', sort @caps; + $inst->{codeDiff} = $fileCode ^ $code; + $inst->{reuseDiff} = $fileReuse ^ $reuse; + + # compare calculated and file values + if ($code == $fileCode && $reuse == $fileReuse) + { + $inst->{grade} = 'PASS'; + push @instructs, $inst if $all; + $pass++; + } + else + { + $inst->{grade} = 'FAIL'; + push @instructs, $inst; + $fail++; + } + $match = 1; + last; + } + unless ($match) + { + $inst->{grade} = 'FAIL'; + $inst->{codeDiff} = $fileCode; + $inst->{reuseDiff} = $fileReuse; + push @instructs, $inst; + $fail++; + } + } + } + my %maxLen; + foreach (@instructs) + { + $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}}; + } + my ($lastOp, $template); + foreach my $inst (sort { + $a->{op} cmp $b->{op} || + $a->{codeDiff} <=> $b->{codeDiff} || + $a->{reuseDiff} <=> $b->{reuseDiff} || + $a->{ins} cmp $b->{ins} + } @instructs) + { + if ($lastOp ne $inst->{op}) + { + $lastOp = $inst->{op}; + $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s %s\n"; + printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures); + } + printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)}; + } + my $reusePct = $reuseHistory{total} ? 
100 * $reuseHistory{reuse} / $reuseHistory{total} : 0; + + printf "\nRegister Bank Conflicts: %d, Reuse: %.1f% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n", + $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total}; + + return $fail; +} + +# Convert cuobjdump sass to the working format +sub Extract +{ + my ($in, $out, $params) = @_; + + my %paramMap; + my %constants = + ( + blockDimX => 'c[0x0][0x8]', + blockDimY => 'c[0x0][0xc]', + blockDimZ => 'c[0x0][0x10]', + gridDimX => 'c[0x0][0x14]', + gridDimY => 'c[0x0][0x18]', + gridDimZ => 'c[0x0][0x1c]', + ); + print $out "\n"; + + foreach my $const (sort keys %constants) + { + print $out " $const : $constants{$const}\n"; + $paramMap{$constants{$const}} = $const; + } + print $out "\n"; + + foreach my $p (@$params) + { + my ($ord,$offset,$size,$align) = split ':', $p; + + if ($size > 4) + { + my $num = 0; + $offset = hex $offset; + while ($size > 0) + { + my $param = sprintf 'param_%d[%d]', $ord, $num; + my $const = sprintf 'c[0x0][0x%x]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + $size -= 4; + $offset += 4; + $num += 1; + } + } + else + { + my $param = sprintf 'param_%d', $ord; + my $const = sprintf 'c[0x0][%s]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + } + } + print $out "\n\n"; + + my %labels; + my $labelnum = 1; + + my @data; + FILE: while (my $line = <$in>) + { + my (@ctrl, @ruse); + next unless processSassCtrlLine($line, \@ctrl, \@ruse); + + CTRL: foreach my $ctrl (@ctrl) + { + $line = <$in>; + + my $inst = processSassLine($line) or next CTRL; + + # Convert branch/jump/call addresses to labels + if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)') + { + my $target = hex($1); + + # skip the final BRA and stop processing the file + last FILE if $inst->{op} eq 'BRA' && ($target == $inst->{num} || $target == $inst->{num}-8); + + # check to see if we've already generated a label for this target 
# Run the preprocessing passes over a source text and return the result.
#
# Parameters:
#   $file    - the source text (modified and returned)
#   $include - passed to IncludeFile for resolving included files
#   $debug   - passed through to Scheduler
#   $regMap  - optional hash ref that receives the register map; when one is
#              supplied, the register-map sections are removed from the
#              output, otherwise they are left in place
#
# The passes below are order dependent: includes, comments, embedded code,
# constant map, register map, then scheduling.
#
# NOTE(review): the embedded-code passes string-eval text taken from $file,
# so this must only be run on trusted sources.
sub Preprocess
{
    my ($file, $include, $debug, $regMap) = @_;

    my $constMap = {};
    my $removeRegMap;
    # A caller-supplied map signals that assembly follows, so the map text is dropped later.
    if ($regMap)
        { $removeRegMap = 1; }
    else
        { $regMap = {}; }

    # include nested files (repeat until no include directive is left)
    1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg;

    # Strip out comments
    $file =~ s|$CommentRe||g;

    # Execute the CODE sections (old way to run code, to be deprecated)
    # Each section is replaced by the value its code evaluates to; an eval error is fatal.
    1 while $file =~ s|$CodeRe|
        my $out = eval "package PascalAs::PascalAs::CODE; $2";
        $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg;

    # Execute the inline code (new way)
    # A "+" marker inserts the evaluated value; a "-" marker runs the code and inserts nothing.
    $file =~ s|$InlineRe|
        my ($type, $code) = ($1, $2);
        my $out = eval "package PascalAs::PascalAs::CODE; $code";
        $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg;

    #Pull in the constMap
    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;

    # Substitute mapped constant names (optionally with a [n] index) line by line.
    my @newFile;
    foreach my $line (split "\n", $file)
    {
        # skip comments
        if ($line !~ m'^\s*(?:#|//).*')
        {
            $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg;
        }
        push @newFile, $line;
    }
    $file = join "\n", @newFile;

    # Pull in the reg map first as the Scheduler will need it to handle vector instructions
    # Remove the regmap if we're going on to assemble
    # ($& re-inserts the matched text unchanged when the map is being kept)
    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg;

    # Pick out the SCHEDULE_BLOCK sections
    my @schedBlocks = $file =~ /$ScheduleRe/g;

    # Schedule them
    foreach my $i (0 .. $#schedBlocks)
    {
        # XMAD macros should only appear in SCHEDULE_BLOCKs
        $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]);

        # Blocks are numbered from 1 for Scheduler's error messages.
        $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug);
    }

    # Replace the results (blocks are consumed in the same order they were captured)
    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;

    return $file;
}
0 : 1; + + # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block) + #$inst->{first} = $inst->{ctrl} & 0x0000f ? 1 : 2; + $inst->{exeTime} = 0; + $inst->{order} = $ordered++ if $ordered; + push @instructs, $inst; + } + # match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n"; + } + # open an ORDERED block + elsif ($line =~ m'^') + { + die "you cannot use nested tags" if $ordered; + $ordered = 1; + } + # close an ORDERED block + elsif ($line =~ m'^') + { + die "missing opening for closing tag" if !$ordered; + $ordered = 0; + } + else + { + die "badly formed line at block: $blockNum line: $lineNum: $line\n"; + } + } + + my (%writes, %reads, @ready, @schedule, $orderedParent); + # assemble the instructions to op codes + foreach my $instruct (@instructs) + { + my $match = 0; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $capData = parseInstruct($instruct->{inst}, $gram) or next; + my (@dest, @src); + + # copy over instruction types for easier access + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + + # A predicate prefix is treated as a source reg + push @src, $instruct->{predReg} if $instruct->{pred}; + + # Handle P2R and R2P specially + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + my $list = $instruct->{op} eq 'R2P' ? 
\@dest : \@src; + my $mask = hex($capData->{i20w7}); + foreach my $p (0..6) + { + if ($mask & (1 << $p)) + { + push @$list, "P$p"; + } + # make this instruction dependent on any predicates it's not setting + # this is to prevent a race condition for any predicate sets that are pending + elsif ($instruct->{op} eq 'R2P') + { + push @src, "P$p"; + } + } + # These instructions can't be dual issued + $instruct->{nodual} = 1; + } + + # Populate our register source and destination lists, skipping any zero or true values + foreach my $operand (grep { exists $regops{$_} } sort keys %$capData) + { + # figure out which list to populate + my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src; + + # Filter out RZ and PT + my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT'; + + if ($capData->{$operand} ne $badVal) + { + # add the value to list with the correct prefix + push @$list, + $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) : + $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) : + $operand eq 'CC' ? 'CC' : + $operand eq 'X' ? 'CC' : + getRegNum($regMap, $capData->{$operand}); + } + } + $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39}); + + # Find Read-After-Write dependencies + foreach my $src (grep { exists $writes{$_} } @src) + { + # Memory operations get delayed access to registers but not to the predicate + my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat}; + + # the parent should be the most recently added dest op to the stack + foreach my $parent (@{$writes{$src}}) + { + # add this instruction as a child of the parent + # set the edge to the total latency of reg source availability + #print "R $parent->{inst}\n\t\t$instruct->{inst}\n"; + my $latency = $src =~ m'^P\d' ? 
13 : $parent->{lat}; + push @{$parent->{children}}, [$instruct, $latency - $regLatency]; + $instruct->{parents}++; + + # if the destination was conditionally executed, we also need to keep going back till it wasn't + last unless $parent->{pred}; + } + } + + # Find Write-After-Read dependencies + foreach my $dest (grep { exists $reads{$_} } @dest) + { + # Flag this instruction as dependent to any previous read + foreach my $reader (@{$reads{$dest}}) + { + # no need to stall for these types of dependencies + #print "W $reader->{inst} \t\t\t $instruct->{inst}\n"; + push @{$reader->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + # Once dependence is marked we can clear out the read list (unless this write was conditional). + # The assumption here is that you would never want to write out a register without + # subsequently reading it in some way prior to writing it again. + delete $reads{$dest} unless $instruct->{pred}; + } + + # Enforce instruction ordering where requested + if ($instruct->{order}) + { + if ($orderedParent) + { + push @{$orderedParent->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + $orderedParent = $instruct; + } + elsif ($orderedParent) + { $orderedParent = 0; } + + # For a dest reg, push it onto the write stack + unshift @{$writes{$_}}, $instruct foreach @dest; + + # For a src reg, push it into the read list + push @{$reads{$_}}, $instruct foreach @src; + + # if this instruction has no dependencies it's ready to go + push @ready, $instruct if !exists $instruct->{parents}; + + $match = 1; + last; + } + die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match; + } + %writes = (); + %reads = (); + + if (@ready) + { + # update dependent counts for sorting hueristic + my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" }; + + countUniqueDescendants($readyParent, {}); + updateDepCounts($readyParent, {}); + + # sort the initial ready list + @ready = sort 
{ + $a->{first} <=> $b->{first} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + # Process the ready list, adding new instructions to the list as we go. + my $clock = 0; + while (my $instruct = shift @ready) + { + my $stall = $instruct->{stall}; + + # apply the stall to the previous instruction + if (@schedule && $stall < 16) + { + my $prev = $schedule[$#schedule]; + + # if stall is greater than 4 then also yield + # the yield flag is required to get stall counts 12-15 working correctly. + $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0; + $prev->{ctrl} |= $stall; + $clock += $stall; + } + # For stalls bigger than 15 we assume the user is managing it with a barrier + else + { + $instruct->{ctrl} &= 0x1fff0; + $instruct->{ctrl} |= 1; + $clock += 1; + } + print "$clock: $instruct->{inst}\n" if $debug; + + # add a new instruction to the schedule + push @schedule, $instruct; + + # update each child with a new earliest execution time + if (my $children = $instruct->{children}) + { + foreach (@$children) + { + my ($child, $latency) = @$_; + + # update the earliest clock value this child can safely execute + my $earliest = $clock + $latency; + $child->{exeTime} = $earliest if $child->{exeTime} < $earliest; + + print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug; + + # decrement parent count and add to ready queue if none remaining. + push @ready, $child if --$child->{parents} < 1; + } + delete $instruct->{children}; + } + + # update stall and mix values in the ready queue on each iteration + foreach my $ready (@ready) + { + # calculate how many instructions this would cause the just added instruction to stall. 
+ $stall = $ready->{exeTime} - $clock; + $stall = 1 if $stall < 1; + + # if using the same compute resource as the prior instruction then limit the throughput + if ($ready->{class} eq $instruct->{class}) + { + $stall = $ready->{tput} if $stall < $ready->{tput}; + } + # dual issue with a simple instruction (tput <= 2) + # can't dual issue two instructions that both load a constant + elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} && + $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const})) + { + $stall = 0; + } + $ready->{stall} = $stall; + + # add an instruction class mixing huristic that catches anything not handled by the stall + $ready->{mix} = $ready->{class} ne $instruct->{class} || 0; + } + + # sort the ready list by stall time, mixing huristic, dependencies and line number + @ready = sort { + $a->{first} <=> $b->{first} || + $a->{stall} <=> $b->{stall} || + $b->{mix} <=> $a->{mix} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(f exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + my $out; + #$out .= "$_\n" foreach @comments; + $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule; + return $out; +} + +sub setConstMap +{ + my ($constMap, $constMapText) = @_; + + foreach my $line (split "\n", $constMapText) + { + # strip leading space + $line =~ s|^\s+||; + # strip comments + $line =~ s{(?:#|//).*}{}; + # strip trailing space + $line =~ s|\s+$||; + # skip blank lines + next unless $line =~ m'\S'; + + my ($name, $value) = split '\s*:\s*', $line; + + $constMap->{$name} = $value; + } + return; +} + +sub setRegisterMap +{ + my ($regMap, $regmapText) = @_; + + my $vectors = $regMap->{__vectors} ||= {}; + my $regBank = $regMap->{__regbank} ||= {}; + my %aliases; + + foreach my $line (split 
"\n", $regmapText) + { + # strip leading space + $line =~ s|^\s+||; + # strip comments + $line =~ s{(?:#|//).*}{}; + # strip trailing space + $line =~ s|\s+$||; + # skip blank lines + next unless $line =~ m'\S'; + + my $auto = $line =~ /~/; + my $share = $line =~ /=/; + + my ($regNums, $regNames) = split '\s*[:~=]\s*', $line; + + my (@numList, @nameList, %vecAliases); + foreach my $num (split '\s*,\s*', $regNums) + { + my ($start, $stop) = split '\s*\-\s*', $num; + die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop; + push @numList, ($start .. $stop||$start); + } + foreach my $fullName (split '\s*,\s*', $regNames) + { + if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$') + { + my ($name1, $name2, $bank) = ($1, $3, $4); + foreach (split '\s*\|\s*', $2) + { + my ($start, $stop) = split '\s*\-\s*'; + foreach my $r (map "$name1$_$name2", $start .. $stop||$start) + { + # define an alias for use in vector instructions that omits the number portion + $aliases{$r} = "$name1$name2" unless exists $aliases{$r}; + push @nameList, $r; + $regBank->{$r} = $bank if $auto && defined $bank; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank; + } + } + } + elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$') + { + push @nameList, $1; + $regBank->{$1} = $2 if $auto && defined $2; + warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2; + } + else + { + die "Bad register name: '$fullName' at: $line\n"; + } + } + die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList; + die "Missmatched register mapping at: $line\n" if $share && @numList > 1; + + # detect if this list is monotonically ascending with no gaps + my $i = 0; + while ($i < $#numList-1) + { + last if $numList[$i] + 1 != $numList[$i+1]; + $i++; + } + my $ascending = $i+1 == $#numList; + + foreach my $n 
(0..$#nameList) + { + die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]}; + + if ($auto) + { + # assign possible values to be assigned on assembly + $regMap->{$nameList[$n]} = \@numList; + } + elsif ($share) + { + # each name shares the same single register + $regMap->{$nameList[$n]} = 'R' . $numList[0]; + } + else + { + $regMap->{$nameList[$n]} = 'R' . $numList[$n]; + # flag any even register as a potential vector + if ($ascending && ($numList[$n] & 1) == 0) + { + # constrain potential range to vector alignment + my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3); + if ($end <= $#nameList) + { + $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ]; + #setup an alias for the base name without the number + if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}}) + { + $regMap->{$aliases{$nameList[$n]}} = $regMap->{$nameList[$n]}; + $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]}; + delete $aliases{$nameList[$n]}; + } + } + } + } + } + } + #print Dumper($regMap); exit(1); +} + +sub preProcessLine +{ + # strip leading space + $_[0] =~ s|^\s+||; + + # preserve comment but check for emptiness + my $val = shift; + + # strip comments + $val =~ s{(?:#|//).*}{}; + + # skip blank lines + return $val =~ m'\S'; +} + +# traverse the graph and count total descendants per node. +# only count unique nodes (by lineNum) +sub countUniqueDescendants +{ + my ($node, $edges) = @_; + + #warn "$node->{inst}\n"; + + if (my $children = $node->{children}) + { + foreach my $child (grep $_->[1], @$children) # skip WaR deps and traversed edges + { + next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++; + + $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges); + } + } + else + { + return $node->{lineNum}; + } + return ($node->{lineNum}, keys %{$node->{deps}}); +} +# convert hash to count for easier sorting. 
# Walk the dependency graph below $node and replace each node's {deps}
# hash with a plain number so the ready list can be sorted numerically
# on it.
#
#   $node  - graph node (hashref); reads {children}, {lineNum}, {deps}
#   $edges - hashref of "parentLine^childLine" keys already traversed;
#            shared across the recursion so each edge is followed once
#
# Nodes whose {deps} was never populated end up with a count of 0.
sub updateDepCounts
{
    my ($node, $edges) = @_;

    #warn "$node->{inst}\n";

    if (my $children = $node->{children})
    {
        foreach my $child (@$children)
        {
            # each child entry is [ $childNode, $latency ]; skip edges we've already walked
            next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++;
            updateDepCounts($child->[0], $edges);
        }
    }
    # keys() is evaluated in scalar context here, yielding the number of entries
    $node->{deps} = ref $node->{deps} ? keys %{$node->{deps}} : $node->{deps}+0;
}

# Detect register bank conflicts and calculate reuse stats
#
#   $reuseHistory - hashref updated in place: per-slot reuse history under
#                   {r8}/{r20}/{r39}, plus {total}, {reuse} and {conflicts} counters
#   $reuseFlags   - bit flags tested against $reuseSlots{$slot}
#                   (NOTE(review): %reuseSlots is defined outside this block;
#                   presumably one bit per operand slot - confirm)
#   $capData      - captured operands of the instruction (slot name => register)
#   $instAddr     - instruction address, only used in the warning message
#   $inst         - instruction text; when false no warning is printed
#   $nowarn       - when true, suppress the warning message
#
# Returns the number of registers involved in a bank conflict (0 if none).
sub registerHealth
{
    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;

    my (@banks, @conflicts);

    foreach my $slot (qw(r8 r20 r39))
    {
        my $r = $capData->{$slot} or next;
        next if $r eq 'RZ';

        my $slotHist = $reuseHistory->{$slot} ||= {};

        $reuseHistory->{total}++;

        # if this register is in active reuse then ignore for bank conflict checking.
        if (exists $slotHist->{$r})
        {
            $reuseHistory->{reuse}++;
        }
        else
        {
            # extract number from reg and take the modulo-4 value. This is the bank id.
            my $bank = substr($r,1) & 3;

            # check for conflict
            if ($banks[$bank] && $banks[$bank] ne $r)
            {
                # the first conflict also records the register already occupying the bank
                push @conflicts, $banks[$bank] if !@conflicts;
                push @conflicts, $r;

                $reuseHistory->{conflicts}++;
            }
            $banks[$bank] = $r;
        }

        # update the history
        if ($reuseFlags & $reuseSlots{$slot})
        { $slotHist->{$r} = 1; }
        else
        { delete $slotHist->{$r}; }
    }
    if ($inst && @conflicts && !$nowarn)
    {
        # pass the instruction text as an argument rather than as part of the
        # format so a literal '%' in it is never treated as a conversion
        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
    }
    return scalar @conflicts;
}

1;

__END__

=head1 NAME

PascalAs::PascalAs - Assembler for NVIDIA Maxwell architecture

=head1 SYNOPSIS

    Pascalas.pl [opts]

=head1 DESCRIPTION

See the documentation at: https://github.com/NervanaSystems/pascalas

=head1 SEE ALSO

See the documentation at: https://github.com/NervanaSystems/pascalas


=head1 AUTHOR

Scott Gray, Esgray@nervanasys.com

=head1 COPYRIGHT AND LICENSE

The MIT License (MIT)

Copyright (c) 2014 Scott Gray

Permission is hereby granted,
free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +=cut diff --git a/Assembler/PascalAs/blib/lib/PascalAs/PascalAsGrammar.pm b/Assembler/PascalAs/blib/lib/PascalAs/PascalAsGrammar.pm new file mode 100644 index 0000000..bf25fb8 --- /dev/null +++ b/Assembler/PascalAs/blib/lib/PascalAs/PascalAsGrammar.pm @@ -0,0 +1,1437 @@ +package PascalAs::PascalAsGrammar; + +use strict; +use Carp; +use Exporter; +use Data::Dumper; +our @ISA = qw(Exporter); + +our @EXPORT = qw( + %grammar %flags + parseInstruct genCode genReuseCode + processAsmLine processSassLine processSassCtrlLine + replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters +); + +require 5.10.0; + +# Helper functions for operands +sub getI +{ + my ($orig, $pos, $mask) = @_; + my $val = $orig; + my $neg = $val =~ s|^\-||; + + # parse out our custom index immediates for addresses + if ($val =~ m'^(\d+)[xX]<([^>]+)>') + { + # allow any perl expression and multiply result by leading decimal. + # also allow global scalar varibles in the expression. 
+ my $mul = $1; + my $exp = $2; + # strip leading zeros (don't interpret numbers as octal) + $exp =~ s/(?> $trunc) & 0x7ffff if $trunc; + } + return $val << $pos; +} +sub getR +{ + my ($val, $pos) = @_; + if ($val =~ m'^R(\d+|Z)$' && $1 < 255) + { + $val = $1 eq 'Z' ? 0xff : $1; + } + else + { + die "Bad register name found: $val\n"; + } + return $val << $pos; +} +sub getP +{ + my ($val, $pos) = @_; + if ($val =~ m'^P(\d|T)$' && $1 < 7) + { + $val = $1 eq 'T' ? 7 : $1; + } + else + { + die "Bad predicate name found: $val\n"; + } + return $val << $pos; +} +sub getC { ((hex($_[0]) >> 2) & 0x7fff) << 20 } + +# Map operands into their value and position in the op code. +my %operands = +( + p0 => sub { getP($_[0], 0) }, + p3 => sub { getP($_[0], 3) }, + p12 => sub { getP($_[0], 12) }, + p29 => sub { getP($_[0], 29) }, + p39 => sub { getP($_[0], 39) }, + p45 => sub { getP($_[0], 45) }, + p48 => sub { getP($_[0], 48) }, + p58 => sub { getP($_[0], 58) }, + r0 => sub { getR($_[0], 0) }, + r8 => sub { getR($_[0], 8) }, + r20 => sub { getR($_[0], 20) }, + r28 => sub { getR($_[0], 28) }, + r39s20 => sub { getR($_[0], 39) }, + r39 => sub { getR($_[0], 39) }, + r39a => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to whipe it out, register must be in sequence with r20 + c20 => sub { getC($_[0]) }, + c39 => sub { getC($_[0]) }, + c34 => sub { hex($_[0]) << 34 }, + c36 => sub { hex($_[0]) << 36 }, + f20w32 => sub { getF($_[0], 20, 'f') }, + f20 => sub { getF($_[0], 20, 'f', 12) }, + d20 => sub { getF($_[0], 20, 'd', 44) }, + i8w4 => sub { getI($_[0], 8, 0xf) }, + i20 => sub { getI($_[0], 20, 0x7ffff) }, + i20w6 => sub { getI($_[0], 20, 0x3f) }, + i20w7 => sub { getI($_[0], 20, 0x7f) }, + i20w8 => sub { getI($_[0], 20, 0xff) }, + i20w12 => sub { getI($_[0], 20, 0xfff) }, + i20w24 => sub { getI($_[0], 20, 0xffffff) }, + i20w32 => sub { getI($_[0], 20, 0xffffffff) }, + i31w4 => sub { getI($_[0], 31, 0xf) }, + i34w13 => sub { getI($_[0], 34, 0x1fff) }, + 
i36w20 => sub { getI($_[0], 36, 0xfffff) }, + i39w8 => sub { getI($_[0], 39, 0xff) }, + i28w8 => sub { getI($_[0], 28, 0xff) }, + i28w20 => sub { getI($_[0], 28, 0xfffff) }, + i48w8 => sub { getI($_[0], 48, 0xff) }, + i51w5 => sub { getI($_[0], 51, 0x1f) }, + i53w5 => sub { getI($_[0], 53, 0x1f) }, +); + +# Rules for operands and their closely tied flags +my $hex = qr"0[xX][0-9a-fA-F]+"; +my $iAddr = qr"\d+[xX]<[^>]+>"; +my $immed = qr"$hex|$iAddr|\d+"o; +my $reg = qr"[a-zA-Z_]\w*"; # must start with letter or underscore\ +my $p = qr"P[0-6T]"; +my $noPred = qr"(?)"; +my $pred = qr"\@(?\!)?P(?[0-6]) "; +my $p0 = qr"(?$p)"o; +my $p3 = qr"(?$p)"o; +my $p12 = qr"(?\!)?(?$p)"o; +my $p29 = qr"(?\!)?(?$p)"o; +my $p39 = qr"(?\!)?(?$p)"o; +my $p45 = qr"(?$p)"o; +my $p48 = qr"(?$p)"o; +my $p58 = qr"(?$p)"o; +my $r0 = qr"(?$reg)"; +my $r0cc = qr"(?$reg)(?\.CC)?"; +my $r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r28 = qr"(?$reg)"; +my $r39s20 = qr"(?\-)?(?\|)?(?(?$reg))\|?(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39 = qr"(?\-)?(?$reg)(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39a = qr"(?(?$reg))(?\.reuse)?"; +my $c20 = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20x = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20s39 = qr"(?\-)?c\[(?$hex)\]\s*\[(?$hex)\]"o; +my $f20w32 = qr"(?(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))"; +my $f20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $d20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $i8w4 = qr"(?$immed)"o; +my $i20 = qr"(?(?\-)?$immed)(?\.NEG)?"o; +my $i20w6 = qr"(?$immed)"o; +my $i20w7 = qr"(?$immed)"o; +my $i20w8 = qr"(?$immed)"o; +my $i20w12 = qr"(?$immed)"o; +my $i20w24 = qr"(?\-?$immed)"o; +my $i20w32 = qr"(?\-?$immed)"o; +my $i39w8 = qr"(?\-?$immed)"o; +my $i28w8 = 
qr"(?$immed)"o; +my $i28w20 = qr"(?\-?$immed)"o; +my $i31w4 = qr"(?$immed)"o; +my $i34w13 = qr"(?$immed)"o; +my $i36w20 = qr"(?$immed)"o; +my $i48w8 = qr"(?$immed)"o; +my $i51w5 = qr"(?$immed)"o; +my $i53w5 = qr"(?$immed)"o; +my $ir20 = qr"$i20|$r20"o; +my $cr20 = qr"$c20|$r20"o; +my $icr20 = qr"$i20|$c20|$r20"o; +my $fcr20 = qr"$f20|$c20|$r20"o; +my $cr39 = qr"$c20s39|$r39"o; +my $dr20 = qr"$d20|$r20"o; + +# Instruction specific rules for capturing various flags +my $u32 = qr"(?\.U32)?"; +my $ftz = qr"(?\.FTZ)?"; +my $sat = qr"(?\.SAT)?"; +my $rnd = qr"(?:\.(?RN|RM|RP|RZ))?"; +my $round = qr"(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?"; +my $fcmp = qr"(?\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; +my $icmp = qr"\.(?LT|EQ|LE|GT|NE|GE)"; +my $bool = qr"\.(?AND|OR|XOR|PASS_B)"; +my $bool2 = qr"\.(?AND|OR|XOR)"; +my $func = qr"\.(?COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)"; +my $rro = qr"\.(?SINCOS|EX2)"; +my $add3 = qr"(?:\.(?X|RS|LS))?"; +my $lopz = qr"(?:\.(?NZ|Z) $p48,|(?))"o; +my $X = qr"(?\.X)?"; +my $tld = qr"(?NODEP\.)?(?:(?T)|(?P))"; +my $chnls = qr"(?R|RGBA)"; +my $sr = qr"SR_(?\S+)"; +my $shf = qr"(?\.W)?(?:\.(?U64|S64))?(?\.HI)?"; +my $xmad = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $xmadc = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $vmad8 = qr"\.(?[SU])(?8|16)\.(?[SU])(?8|16)(?\.PO)?(?\.SHR_7)?(?\.SHR_15)?(?\.SAT)?"; +my $vmad16= qr"\.(?[SU])(?16)\.(?[SU])(?16)"; +my $hilo = qr"(?:\.(?XHI|XLO))?"; +my $vaddType = qr"(?:\.(?UD))?(?:\.(?SD))?(?:\.(?[SU])(?8|16|32))?(?:\.(?[SU])(?8|16|32))?"; +my $vaddMode = qr"(?:\.(?MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?"; +my $vmnmx = qr"(?:\.(?MX))?"; +my $x2x = qr"\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)"; +my $prmt = qr"(?:\.(?F4E|B4E|RC8|ECL|ECR|RC16))?"; +my $shfl = qr"\.(?IDX|UP|DOWN|BFLY)"; +my $bar = qr"\.(?SYNC|ARV|RED)(?:\.(?POPC|AND|OR))? 
(?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?()|(?))(?(), $p39|(?))"o; +my $b2r = qr"\.RESULT $r0(?:, $p45|(?))"o; +my $dbar = qr"(?SB0|SB1|SB2|SB3|SB4|SB5)"; +my $dbar2 = qr" {(?5)?,?(?4)?,?(?3)?,?(?2)?,?(?1)?,?(?0)?}"; +my $mbar = qr"\.(?CTA|GL|SYS)"; +my $addr = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i20w24)?\]"o; +my $addr2 = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i28w20)?\]"o; +my $ldc = qr"c\[(?$hex)\]\s*$addr"o; +my $atom = qr"(?\.E)?(?:\.(?ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; +my $vote = qr"\.(?ALL|ANY|EQ)"o; +my $memType = qr"(?\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; +my $memCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CI|CS|CV|IL|WT))?"; + + + +# class: hardware resource that shares characteristics with types +# lat : pipeline depth where relevent, placeholder for memory ops +# blat : barrier latency, typical fetch time for memory operations. Highly variable. +# rlat : operand read latency for memory ops +# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op. +# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession. +# dual : whether this instruction type can be dual issued +# reuse: whether this instruction type accepts register reuse flags. + +# Some of these values are guesses and need to be updated from micro benchmarks. +# We may need to split these classes up further. 
+my $s2rT = {class => 's2r', lat => 2, blat => 25, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $smemT = {class => 'mem', lat => 2, blat => 30, rlat => 2, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $gmemT = {class => 'mem', lat => 2, blat => 200, rlat => 4, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $x32T = {class => 'x32', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 1}; +my $x64T = {class => 'x64', lat => 2, blat => 128, rlat => 0, rhold => 0, tput => 128, dual => 0, reuse => 1}; +my $shftT = {class => 'shift', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $cmpT = {class => 'cmp', lat => 13, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $qtrT = {class => 'qtr', lat => 8, blat => 0, rlat => 4, rhold => 0, tput => 1, dual => 1, reuse => 0}; +my $rroT = {class => 'rro', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $voteT = {class => 'vote', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; + + +# Create map of op names to rules +our %grammar = +( + #Floating Point Instructions + FADD => [ { type => $x32T, code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FADD32I => [ { type => $x32T, code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o, } ], + FCHK => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o, } ], #Partial? 
+ FCMP => [ { type => $cmpT, code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o, } ], + FFMA => [ + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o, }, + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o, }, + ], + FMNMX => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o, } ], + FMUL => [ { type => $x32T, code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FMUL32I => [ { type => $x32T, code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o, } ], + FSET => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o, } ], + FSETP => [ { type => $cmpT, code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], + MUFU => [ { type => $qtrT, code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o, } ], + RRO => [ { type => $rroT, code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o, } ], + DADD => [ { type => $x64T, code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o, } ], + DFMA => [ { type => $x64T, code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o, } ], + DMNMX => [ { type => $cmpT, code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o, } ], + DMUL => [ { type => $x64T, code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o, } ], + DSET => [ { type => $cmpT, code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o, } ], + DSETP => [ { type => $cmpT, code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o, } ], + FSWZADD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o, } ], #TODO + + HADD2 => [ 
{ type => $x32T, code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o, } ], + HMUL2 => [ { type => $x32T, code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o, } ], + HFMA2 => [ { type => $x32T, code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o, } ], + HSETP2 => [ { type => $cmpT, code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial + + #Integer Instructions + BFE => [ { type => $shftT, code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o, } ], + BFI => [ { type => $shftT, code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o, } ], + FLO => [ { type => $s2rT, code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o, } ], + IADD => [ { type => $x32T, code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o, } ], + IADD32I => [ { type => $x32T, code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o, } ], + IADD3 => [ { type => $x32T, code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o, } ], + ICMP => [ { type => $cmpT, code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o, } ], + IMNMX => [ { type => $shftT, code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o, } ], + ISET => [ { type => $shftT, code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o, } ], + ISETP => [ { type => $cmpT, code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ], + ISCADD => [ { type => $shftT, code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o, } ], + ISCADD32I => [ { type => $shftT, code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o, } ], + LEA => [ + { type => $cmpT, code => 0x5bd0000000000000, rule => 
qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o, }, + { type => $shftT, code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o, }, + { type => $shftT, code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o, }, + { type => $shftT, code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o, }, + ], + LOP => [ { type => $x32T, code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?~)?$icr20(?\.INV)?;"o, } ], + LOP32I => [ { type => $x32T, code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o, } ], + LOP3 => [ + { type => $x32T, code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o, }, + { type => $x32T, code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o, }, + ], + POPC => [ { type => $s2rT, code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o, } ], + SHF => [ + { type => $shftT, code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o, }, + { type => $shftT, code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o, }, + ], + SHL => [ { type => $shftT, code => 0x5c48000000000000, rule => qr"^$pred?SHL(?\.W)? 
$r0, $r8, $icr20;"o, } ], + SHR => [ { type => $shftT, code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o, } ], + XMAD => [ + { type => $x32T, code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o, }, + { type => $x32T, code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o, }, + ], + # XMAD replaces these + IMAD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o, } ], #TODO + IMADSP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO + IMUL => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o, } ], #TODO + + #Conversion Instructions + F2F => [ { type => $qtrT, code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ], + F2I => [ { type => $qtrT, code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o, } ], + I2F => [ { type => $qtrT, code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o, } ], + I2I => [ { type => $qtrT, code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o, } ], + + #Movement Instructions + MOV => [ { type => $x32T, code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o, } ], + MOV32I => [ { type => $x32T, code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o, } ], + PRMT => [ { type => $x32T, code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ], + SEL => [ { type => $x32T, code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o, } ], + SHFL => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ], + + #Predicate/CC Instructions + PSET => [ { type => $cmpT, code => 0x5088000000000000, rule => 
qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o, } ], + PSETP => [ { type => $cmpT, code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ], + CSET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o, } ], #TODO + CSETP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO + P2R => [ { type => $x32T, code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o, } ], + R2P => [ { type => $cmpT, code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o, } ], + + #Texture Instructions + # Handle the commonly used 1D texture functions.. but save the others for later + TLD => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial + TLDS => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial + TEX => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o, } ], #TODO + TLD4 => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o, } ], #TODO + TXQ => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o, } ], #TODO + TEXS => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o, } ], #TODO + TLD4S => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO + + #Compute Load/Store Instructions + LD => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o, } ], + ST => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o, } ], + LDG => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o, } ], + STG => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, 
$r0;"o, } ], + LDS => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o, } ], + STS => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o, } ], + LDL => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o, } ], + STL => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o, } ], + LDC => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o, } ], + # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded). + ATOM => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + ATOMS => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + RED => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o, } ], + CCTL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o, } ], #TODO + CCTLL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO + CCTLT => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO + + #Surface Memory Instructions (haven't gotten to these yet..) + SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO + SULD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o, } ], #TODO + SURED => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o, } ], #TODO + SUST => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o, } ], #TODO + + #Control Instructions + BRA => [ + { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?\.U)? 
$i20w24;"o, }, + { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?\.U)? CC\.EQ, $i20w24;"o, }, + ], + BRX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o, } ], #TODO + JMP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o, } ], #TODO + JMX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o, } ], #TODO + SSY => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o, } ], + SYNC => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o, } ], + CAL => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o, } ], + JCAL => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o, } ], + PRET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o, } ], #TODO + RET => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o, } ], + BRK => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o, } ], + PBK => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o, } ], + CONT => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o, } ], + PCNT => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o, } ], + EXIT => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o, } ], + PEXIT => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o, } ], #TODO + BPT => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o, } ], + + #Miscellaneous Instructions + NOP => [ { type => $x32T, code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o, } ], + CS2R => [ { type => $x32T, code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o, } ], + S2R => [ { type => $s2rT, code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o, } ], + B2R => [ { type => $x32T, code => 
0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o, } ], + BAR => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o, } ], + DEPBAR => [ + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, }, + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o, }, + ], + MEMBAR => [ { type => $x32T, code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o, } ], + VOTE => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?))$p45, $p39;"o, } ], + R2B => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o, } ], #TODO + + #Video Instructions... Need to finish + VADD => [ { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMAD => [ + { type => $x32T, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, }, + ], + VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMNMX => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + + VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 +); + +# Create map of capture groups to op code flags that need to be added (or removed) +my @flags = grep /\S/, split "\n", q{; + +BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD +0x0100000000000000 neg + +FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP +0x0100000000000000 neg + +PSET, PSETP 
+0x0000000000008000 p12not +0x0000000100000000 p29not + +FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE +0x0000040000000000 p39not + +IADD, IADD3, XMAD, LEA, IMNMX +0x0000800000000000 CC + +IADD32I +0x0010000000000000 CC + +LEA +0x0000000000000000 X + +SHF +0x0004000000000000 W +0x0001000000000000 HI + +SHF: type +0x0000004000000000 U64 +0x0000006000000000 S64 + +SHR, IMNMX, ISETP, ISET, ICMP, BFE +0x0001000000000000 U32 + +SHL +0x0000008000000000 W + +SHFL +0x0000000010000000 i20w8 +0x0000000020000000 i34w13 + +SHFL: mode +0x0000000000000000 IDX +0x0000000040000000 UP +0x0000000080000000 DOWN +0x00000000c0000000 BFLY + +IMNMX: mode +0x0000080000000000 XLO +0x0000180000000000 XHI + +ISETP, ISET, ICMP: cmp +0x0002000000000000 LT +0x0004000000000000 EQ +0x0006000000000000 LE +0x0008000000000000 GT +0x000a000000000000 NE +0x000c000000000000 GE + +ISETP, ISET, PSETP, PSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +PSETP, PSET: bool2 +0x0000000000000000 AND +0x0000000001000000 OR +0x0000000002000000 XOR + +ISETP, ISET +0x0000080000000000 X + +LOP: bool +0x0000000000000000 AND +0x0000020000000000 OR +0x0000040000000000 XOR +0x0000060000000000 PASS_B + +LOP: +0x0000010000000000 INV + +LOP: z +0x0000200000000000 Z +0x0000300000000000 NZ + +LOP +0x0007000000000000 noz + +LOP32I: bool +0x0000000000000000 AND +0x0020000000000000 OR +0x0040000000000000 XOR + +PRMT: mode +0x0001000000000000 F4E +0x0002000000000000 B4E +0x0003000000000000 RC8 +0x0004000000000000 ECL +0x0005000000000000 ECR +0x0006000000000000 RC16 + +XMAD: type1 +0x0000000000000000 U16 +0x0001000000000000 S16 + +XMAD: type2 +0x0000000000000000 U16 +0x0002000000000000 S16 + +XMAD: mode +0x0000002000000000 MRG +0x0000001000000000 PSL +0x0008000000000000 CHI +0x0004000000000000 CLO +0x000c000000000000 CSFU + +XMAD: modec +0x0004000000000000 CLO +0x0008000000000000 CHI +0x000c000000000000 CSFU +0x0040000000000000 X +0x0080000000000000 PSL 
+0x0100000000000000 MRG + +XMAD +0x0010000000000000 CBCC + +XMAD: r8part +0x0000000000000000 H0 +0x0020000000000000 H1 + +XMAD: r20part +0x0000000000000000 H0 +0x0000000800000000 H1 + +XMAD: r20partx +0x0000000000000000 H0 +0x0010000000000000 H1 + +XMAD: r39part +0x0000000000000000 H0 +0x0010000000000000 H1 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part +0x0000000000000000 B0 +0x0000001000000000 B1 +0x0000002000000000 B2 +0x0000003000000000 B3 +0x0000001000000000 H1 +0x0000000000000000 H0 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part +0x0000000000000000 B0 +0x0000000010000000 B1 +0x0000000020000000 B2 +0x0000000030000000 B3 +0x0000000010000000 H1 +0x0000000000000000 H0 + +VMAD +0x0040000000000000 r8neg +0x0020000000000000 r39neg +0x0008000000000000 SHR_7 +0x0010000000000000 SHR_15 +0x0060000000000000 PO +0x0080000000000000 SAT + +VMNMX +0x0100000000000000 MX + +VADD, VABSDIFF, VMNMX +0x0080000000000000 SAT +0x0040000000000000 UD +0x0040000000000000 SD + +VSET: cmp +0x0040000000000000 LT +0x0080000000000000 EQ +0x00c0000000000000 LE +0x0100000000000000 GT +0x0140000000000000 NE +0x0180000000000000 GE + +VADD, VSET: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VABSDIFF: mode +0x0003000000000000 ACC +0x000b000000000000 MIN +0x0013000000000000 MAX +0x0023000000000000 MRG_16H +0x002b000000000000 MRG_16L +0x0033000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x003b000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMNMX: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1 +0x0000000000000000 U +0x0001000000000000 S + +VMAD, VADD, 
VABSDIFF, VMNMX, VSET: sign2 +0x0000000000000000 U +0x0002000000000000 S + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size1 +0x0000000000000000 8 +0x0000004000000000 16 +0x0000006000000000 32 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size2 +0x0000000000000000 8 +0x0000000040000000 16 +0x0000000060000000 32 + +IADD3: type +0x0001000000000000 X +0x0000002000000000 RS +0x0000004000000000 LS + +IADD3: r8part +0x0000000000000000 H0 +0x0000001000000000 H1 + +IADD3: r20part +0x0000000080000000 H0 + +IADD3: r39part +0x0000000200000000 H0 + +IADD3 +0x0008000000000000 r8neg +0x0004000000000000 r20neg +0x0002000000000000 r39neg + +IADD +0x0000080000000000 X +0x0004000000000000 SAT + +IADD, ISCADD +0x0002000000000000 r8neg +0x0001000000000000 r20neg + +IADD32I +0x0100000000000000 r8neg +0x0020000000000000 X + +DEPBAR: SB +0x0000000000000000 SB0 +0x0000000004000000 SB1 +0x0000000008000000 SB2 +0x000000000c000000 SB3 +0x0000000010000000 SB4 +0x0000000014000000 SB5 + +DEPBAR: cmp +0x0000000020000000 LE + +DEPBAR +0x0000000000000001 db0 +0x0000000000000002 db1 +0x0000000000000004 db2 +0x0000000000000008 db3 +0x0000000000000010 db4 +0x0000000000000020 db5 + +F2F, F2I, I2F, I2I: destWidth +0x0000000000000000 8 +0x0000000000000100 16 +0x0000000000000200 32 +0x0000000000000300 64 + +F2F, F2I, I2F, I2I: srcWidth +0x0000000000000000 8 +0x0000000000000400 16 +0x0000000000000800 32 +0x0000000000000c00 64 + +F2F, F2I, I2F, I2I: destSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000001000 S + +F2F, F2I, I2F, I2I: srcSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000002000 S + +F2I, I2F, I2I: r20part +0x0000000000000000 H0 +0x0000040000000000 H1 +0x0000000000000000 B0 +0x0000020000000000 B1 +0x0000040000000000 B2 +0x0000060000000000 B3 + +F2F: r20part +0x0000000000000000 H0 +0x0000020000000000 H1 + +F2F: round +0x0000040000000000 ROUND +0x0000048000000000 FLOOR +0x0000050000000000 CEIL +0x0000058000000000 TRUNC + +F2I: round +0x0000000000000000 ROUND +0x0000008000000000 FLOOR 
+0x0000010000000000 CEIL +0x0000018000000000 TRUNC + +HADD2, HMUL2: r8part +0x0001000000000000 H0_H0 +0x0000000000000000 H1_H1 + +HFMA2: r20part +0x0000000020000000 H0_H0 +0x0000000030000000 H1_H1 + +FADD, DADD, FMUL, DMUL, F2F, I2F: rnd +0x0000000000000000 RN +0x0000008000000000 RM +0x0000010000000000 RP +0x0000018000000000 RZ + +DFMA: rnd +0x0000000000000000 RN +0x0004000000000000 RM +0x0008000000000000 RP +0x000c000000000000 RZ + +FFMA: rnd +0x0000000000000000 RN +0x0008000000000000 RM +0x0010000000000000 RP +0x0018000000000000 RZ + +FFMA +0x0020000000000000 FTZ + +F2F, F2I, FADD, FMUL, FMNMX +0x0000100000000000 FTZ + +FADD32I +0x0080000000000000 FTZ + +FMUL32I +0x0020000000000000 FTZ + +FSET +0x0080000000000000 FTZ + +FSETP, FCMP +0x0000800000000000 FTZ + +HADD2, HMUL2 +0x0000008000000000 FTZ + +HFMA2 +0x0000002000000000 FTZ + +FADD, FFMA, FMUL, F2F, I2I +0x0004000000000000 SAT + +FADD, DADD, FMNMX, DMNMX, MUFU +0x0001000000000000 r8neg + +FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I +0x0000200000000000 r20neg + +FMUL, DMUL, FFMA, DFMA +0x0001000000000000 r20neg + +FFMA, DFMA +0x0002000000000000 r39neg + +FADD, DADD, FMNMX, DMNMX +0x0000400000000000 r8abs + +FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I +0x0002000000000000 r20abs + +FSETP, DSETP, FSET, DSET +0x0000080000000000 r8neg +0x0000000000000040 r20neg +0x0000000000000080 r8abs +0x0000100000000000 r20abs + +RRO: func +0x0000000000000000 SINCOS +0x0000008000000000 EX2 + +MUFU: func +0x0000000000000000 COS +0x0000000000100000 SIN +0x0000000000200000 EX2 +0x0000000000300000 LG2 +0x0000000000400000 RCP +0x0000000000500000 RSQ +0x0000000000600000 RCP64H +0x0000000000700000 RSQ64H + +FSETP, DSETP, FSET, DSET, FCMP: cmp +0x0001000000000000 .LT +0x0002000000000000 .EQ +0x0003000000000000 .LE +0x0004000000000000 .GT +0x0004000000000000 +0x0005000000000000 .NE +0x0006000000000000 .GE +0x0007000000000000 .NUM +0x0008000000000000 .NAN +0x0009000000000000 .LTU +0x000a000000000000 .EQU +0x000b000000000000 .LEU 
+0x000c000000000000 .GTU +0x000d000000000000 .NEU +0x000e000000000000 .GEU + +FSETP, DSETP, FSET, DSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +HSETP2: cmp +0x0000002800000000 .NE + +HSETP2: bool +0x0000000000000000 AND + +S2R: sr +0x0000000000000000 LANEID +0x0000000000200000 VIRTCFG +0x0000000000300000 VIRTID +0x0000000002100000 TID.X +0x0000000002200000 TID.Y +0x0000000002300000 TID.Z +0x0000000002500000 CTAID.X +0x0000000002600000 CTAID.Y +0x0000000002700000 CTAID.Z +0x0000000003800000 EQMASK +0x0000000003900000 LTMASK +0x0000000003a00000 LEMASK +0x0000000003b00000 GTMASK +0x0000000003c00000 GEMASK + +CS2R: sr +0x0000000005000000 CLOCKLO +0x0000000005100000 CLOCKHI +0x0000000005200000 GLOBALTIMERLO +0x0000000005300000 GLOBALTIMERHI + +B2R +0x0000e00000000000 nop45 + +BAR +0x0000100000000000 i8w4 +0x0000080000000000 nor20 +0x0000038000000000 nop39 + +BAR: mode +0x0000000000000000 SYNC +0x0000000100000000 ARV +0x0000000200000000 RED + +BAR: red +0x0000000000000000 POPC +0x0000000800000000 AND +0x0000001000000000 OR + +MEMBAR: mode +0x0000000000000000 CTA +0x0000000000000100 GL +0x0000000000000200 SYS + +VOTE: mode +0x0000000000000000 ALL +0x0001000000000000 ANY +0x0002000000000000 EQ + +VOTE +0x00000000000000ff nor0 + +BRA +0x0000000000000080 U + +TLDS: chnls +0x0010000000000000 RGBA + +TLDS +0x0002000000000000 NODEP + +LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS +0x000000000000ff00 nor8 + +LD, ST: type +0x0000000000000000 .U8 +0x0020000000000000 .S8 +0x0040000000000000 .U16 +0x0060000000000000 .S16 +0x0080000000000000 +0x0080000000000000 .32 +0x00a0000000000000 .64 +0x00c0000000000000 .128 + +LD, ST: cache +0x0100000000000000 CG +0x0200000000000000 CS +0x0300000000000000 CV +0x0300000000000000 WT + +LDG, STG, LDS, STS, LDL, STL, LDC: type +0x0000000000000000 .U8 +0x0001000000000000 .S8 +0x0002000000000000 .U16 +0x0003000000000000 .S16 +0x0004000000000000 +0x0004000000000000 .32 +0x0005000000000000 .64 
+0x0006000000000000 .128 + +LDG, STG: cache +0x0000400000000000 CG +0x0000800000000000 CI +0x0000800000000000 CS +0x0000c00000000000 CV +0x0000c00000000000 WT + +LDL: cache +0x0000200000000000 CI + +LDC: cache +0x0000100000000000 IL + +LDG, STG, LDS, STS, LDL, STL, LDC +0x0000200000000000 E + +LDS +0x0000100000000000 U + +RED: type +0x0000000000000000 +0x0000000000100000 .S32 +0x0000000000200000 .U64 +0x0000000000300000 .F32.FTZ.RN +0x0000000000400000 .F16x2.FTZ.RN +0x0000000000500000 .S64 + +RED: mode +0x0000000000000000 ADD +0x0000000000800000 MIN +0x0000000001000000 MAX +0x0000000001800000 INC +0x0000000002000000 DEC +0x0000000002800000 AND +0x0000000003000000 OR +0x0000000003800000 XOR + +ATOM: type +0x0000000000000000 +0x0002000000000000 .S32 +0x0004000000000000 .U64 +0x0006000000000000 .F32.FTZ.RN +0x0008000000000000 .F16x2.FTZ.RN +0x000a000000000000 .S64 +0x0002000000000000 .64 + +ATOM, RED +0x0001000000000000 E + +ATOM: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x03f0000000000000 CAS + +ATOMS: type +0x0000000000000000 +0x0000000010000000 .S32 +0x0000000020000000 .U64 +0x0000000030000000 .S64 +0x0010000000000000 .64 + +ATOMS: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x0240000000000000 CAS +}; + +# The existence of a capture group can map directly to an op code adjustment, or... 
# The named capture group value can map the op code adjustment from among several options
#
# Parse the flag table (@flags, defined above) into %flags.  Lines of the form
# "OP, OP, ...[: name]" select the ops (and optional capture-group name) that
# the following "0x<hex> <label>" lines apply to:
#   with a name    -> $flags{op}{name}{label} = value   (multi-valued flag)
#   without a name -> $flags{op}{label}       = value   (existence flag)
our %flags;
my (@ops, $flag);
foreach my $line (@flags)
{
    if ($line =~ m'^(0x[0-9a-z]+)\s*(.*)')
    {
        my $val = hex($1);
        # named rules (op: name)
        if ($flag)
            { $flags{$_}{$flag}{$2} = $val foreach @ops; }
        # simple existence check rules
        else
            { $flags{$_}{$2}        = $val foreach @ops; }
    }
    else
    {
        # header line: remember the op list and (possibly undefined) flag name
        my ($ops, $name) = split ':\s*', $line;
        @ops  = split ',\s*', $ops;
        $flag = $name;
    }
}

# Match an instruction string against one grammar entry.
#   $inst    - instruction text
#   $grammar - hash ref with a compiled regex under {rule}
# Returns nothing when the rule does not match, otherwise a hash ref holding a
# copy of the named captures (%+) produced by the match.
sub parseInstruct
{
    my ($inst, $grammar) = @_;
    return unless $inst =~ $grammar->{rule};
    my %capData = %+;
    return \%capData;
}

# for immediate or constant operands and a given opcode, bits 56-63 get transformed
my %immedOps = map { $_ => 1 } qw(i20 f20 d20);
my %immedCodes =
(
    0x5c => 0x64,
    0x5b => 0x6d,
    0x59 => 0x6b,
    0x58 => 0x68,
);
my %constCodes =
(
    c20 => 0x10,
    c39 => 0x08,
);
my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4);

# just pick out the reuse code and nothing else
#   $capData - hash ref of capture data
# Returns the bitwise OR of the %reuseCodes values whose key holds a true
# value in $capData (0 when none do).  $capData is not modified.
sub genReuseCode
{
    my $capData = shift;
    my $reuse = 0;
    $reuse |= $reuseCodes{$_} foreach grep $capData->{$_}, keys %reuseCodes;
    return $reuse;
}

# Generate an op code from regex capture data
# if you pass in a test array ref it will populate it with the matching capture groups
#   $op      - op name, used to look up its entry in %flags
#   $grammar - grammar entry; {code} is the base op code
#   $capData - capture hash ref; the noPred/predNum/predNot/reuse1-3 keys are
#              DELETED from it as they are consumed
#   $test    - optional array ref that receives the names of the captures used
# Returns the list ($code, $reuse).
sub genCode
{
    my ($op, $grammar, $capData, $test) = @_;

    my $flags     = $flags{$op};
    my $code      = $grammar->{code};
    my $reuse     = 0;
    # replacement top byte for the immediate form, keyed by the base code's top byte
    my $immedCode = $immedCodes{$code >> 56};

    #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I';

    # process the instruction predicate (if valid for this instruction)
    if (exists $capData->{noPred})
    {
        delete $capData->{noPred};
        push @$test, 'noPred' if $test;
    }
    else
    {
        # no explicit predicate register selects 7; predNot sets bit 3
        my $p = defined($capData->{predNum}) ? $capData->{predNum} : 7;
        push @$test, 'predNum' if $test;
        if (exists $capData->{predNot})
        {
            $p |= 8;
            push @$test, 'predNot' if $test;
        }
        $code ^= $p << 16;
        delete @{$capData}{qw(predNum predNot)};
    }

    # process the register reuse flags
    foreach my $rcode (qw(reuse1 reuse2 reuse3))
    {
        if (delete $capData->{$rcode})
        {
            $reuse |= $reuseCodes{$rcode};
            push @$test, $rcode if $test;
        }
    }

    foreach my $capture (keys %$capData)
    {
        # change the base code for immediate versions of the op
        if (exists $immedOps{$capture})
            { $code ^= $immedCode << 56; }
        # change the base code for constant versions of the op
        elsif (exists $constCodes{$capture})
            { $code ^= $constCodes{$capture} << 56; }

        # if capture group is an operand then process and add that data to code
        if (exists $operands{$capture})
        {
            # don't process the r20 that comes with the r39s20 capture
            unless ($capture eq 'r20' && exists $capData->{r39s20})
            {
                $code ^= $operands{$capture}->($capData->{$capture});
                push @$test, $capture if $test;
            }
        }

        # Add matching flags (an operand might also add/remove a flag)
        if (exists $flags->{$capture})
        {
            # a named multivalue flag
            if (ref $flags->{$capture})
            {
                $code ^= $flags->{$capture}{$capData->{$capture}};
                push @$test, "$capture:$capData->{$capture}" if $test;
            }
            # a simple exists flag
            else
            {
                $code ^= $flags->{$capture};
                push @$test, $capture if $test;
            }
        }
        elsif (!exists $operands{$capture} && !$test)
        {
            # Every capture group should be acted upon.  Missing one is a bug.
            warn "UNUSED: $op: $capture: $capData->{$capture}\n";
            warn Dumper($flags);
        }
    }

    return $code, $reuse;
}


# Building blocks for recognising assembly lines.  The capture names must match
# the $+{...} keys read by the subs below.
my $CtrlRe = qr'(?<ctrl>[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])';
my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';
my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;
my $CommRe = qr'(?<comment>.*)';

# Parse one line of the form "<ctrl><space>[@[!]Pn ]OP operands; comment".
#   $line    - source line
#   $lineNum - stored as-is in the result
# Returns a hash ref (lineNum, pred, predReg, space, op, comment, inst, ctrl),
# or undef when the line does not match.  pred/predReg are undef for an
# unpredicated instruction; ctrl is the numeric value produced by readCtrl.
sub processAsmLine
{
    my ($line, $lineNum) = @_;

    if ($line =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o)
    {
        # an absent predicate contributes nothing to the instruction text
        my $pred = defined $+{pred} ? $+{pred} : '';
        return {
            lineNum => $lineNum,
            pred    => $+{pred},
            predReg => $+{predReg},
            space   => $+{space},
            op      => $+{op},
            comment => $+{comment},
            inst    => normalizeSpacing($pred . $+{op} . $+{rest}),
            ctrl    => readCtrl($+{ctrl}, $line),
        };
    }
    return undef;
}

# Parse one disassembly line: "  /*<hex num>*/  [@Pn ]OP operands;  /* 0x<hex code>".
# Returns a hash ref (num, pred, op, ins, inst, code) or undef when the line
# does not match.  "ins" is the instruction without its predicate, "inst"
# includes it; num and code are converted from hex.
sub processSassLine
{
    my $line = shift;

    if ($line =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o)
    {
        my $pred = defined $+{pred} ? $+{pred} : '';
        return {
            num  => hex($+{num}),
            pred => $+{pred},
            op   => $+{op},
            ins  => normalizeSpacing($+{op} . $+{rest}),
            inst => normalizeSpacing($pred . $+{op} . $+{rest}),
            code => hex($+{code}),
        };
    }
    return undef;
}

# Decode a line that carries only a "/* 0x<hex>" word.
#   $ctrl - optional array ref; receives three 17-bit fields taken from
#           bits 0-16, 21-37 and 42-58 of the word
#   $ruse - optional array ref; receives three 4-bit fields taken from
#           bits 17-20, 38-41 and 59-62 of the word
# Returns 1 when the line matched, 0 otherwise.
sub processSassCtrlLine
{
    my ($line, $ctrl, $ruse) = @_;

    return 0 unless $line =~ m'^\s+\/\* (0x[0-9a-f]+)';

    my $code = hex($1);
    if (ref $ctrl)
    {
        push @$ctrl, ($code & 0x000000000001ffff) >> 0;
        push @$ctrl, ($code & 0x0000003fffe00000) >> 21;
        push @$ctrl, ($code & 0x07fffc0000000000) >> 42;
    }
    if (ref $ruse)
    {
        push @$ruse, ($code & 0x00000000001e0000) >> 17;
        push @$ruse, ($code & 0x000003c000000000) >> 38;
        push @$ruse, ($code & 0x7800000000000000) >> 59;
    }
    return 1;
}

# Expand the XMAD.LO / XMAD.LO2 / XMAD.LO2C macros found in $file (the whole
# source text as one string) into sequences of real XMAD instructions and
# return the rewritten text.  Dies when the destination register equals the
# first operand.  The control code, spacing and predicate of the macro line
# are repeated on every generated line; the comment is kept on the first one.
# NOTE: the substitutions are delimited by "/", so no "/" may appear inside
# their replacement code.
sub replaceXMADs
{
    my $file = shift;

# XMAD.LO d, a, b, c, x;
# ----------------------
# XMAD.MRG x, a, b.H1, RZ;
# XMAD d, a, b, c;
# XMAD.PSL.CBCC d, a.H1, x.H1, d;
# ----------------------
# XMAD d, a, 0xffff, c;
# XMAD.PSL d, a.H1, 0xffff, d;
    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/

        die "XMAD.LO: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
        sprintf '
%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
            map { defined $_ ? $_ : '' } @+{qw(ctrl space pred d a b c x comment)}
    /egmos;

    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/

        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
        sprintf '
%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
            map { defined $_ ? $_ : '' } @+{qw(ctrl space pred d a b c comment mod)}
    /egmos;

    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/

        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
        sprintf '
%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
            map { defined $_ ? $_ : '' } @+{qw(ctrl space pred d a b c comment mod)}
    /egmos;

    #TODO: add more XMAD macros
    return $file;
}

# convert extra spaces to single spacing to make our re's simpler
# Tabs become spaces, then every run of two or more whitespace characters is
# collapsed to one space.  Returns the normalized copy.
sub normalizeSpacing
{
    my $inst = shift;
    $inst =~ s/\t/ /g;
    $inst =~ s/\s{2,}/ /g;
    return $inst;
}


# map binary control notation on to easier to work with format.
# Returns "WW:R:W:Y:S" where
#   WW - wait mask (bits 11-16) as two hex digits, or "--" when zero
#   R  - read barrier  (bits 8-10) + 1, or "-" when the field is 7
#   W  - write barrier (bits 5-7)  + 1, or "-" when the field is 7
#   Y  - "Y" when bit 4 is clear, "-" when it is set
#   S  - stall count (bits 0-3) in hex
sub printCtrl
{
    my $code = shift;

    my $stall = ($code & 0x0000f) >> 0;
    my $yield = ($code & 0x00010) >> 4;
    my $wrtdb = ($code & 0x000e0) >> 5;  # write dependency barrier
    my $readb = ($code & 0x00700) >> 8;  # read  dependency barrier
    my $watdb = ($code & 0x1f800) >> 11; # wait on dependency barrier

    $yield = $yield      ? '-' : 'Y';
    $wrtdb = $wrtdb == 7 ? '-' : $wrtdb + 1;
    $readb = $readb == 7 ? '-' : $readb + 1;
    $watdb = $watdb      ? sprintf('%02x', $watdb) : '--';

    return sprintf '%s:%s:%s:%s:%x', $watdb, $readb, $wrtdb, $yield, $stall;
}

# Inverse of printCtrl: turn "WW:R:W:Y:S" into the numeric control value.
#   $ctrl    - control string
#   $context - text appended to the error message (callers pass the source line)
# Dies when the wait mask does not fit in 6 bits (0x00-0x3f).
sub readCtrl
{
    my ($ctrl, $context) = @_;
    my ($watdb, $readb, $wrtdb, $yield, $stall) = split ':', $ctrl;

    $watdb = $watdb eq '--' ? 0 : hex $watdb;
    $readb = $readb eq '-'  ? 7 : $readb - 1;
    $wrtdb = $wrtdb eq '-'  ? 7 : $wrtdb - 1;
    $yield = $yield eq 'y' || $yield eq 'Y'  ? 0 : 1;
    $stall = hex $stall;

    die sprintf('wait dep out of range(0x00-0x3f): %x at %s', $watdb, $context) if $watdb != ($watdb & 0x3f);

    return
        $watdb << 11 |
        $readb << 8  |
        $wrtdb << 5  |
        $yield << 4  |
        $stall << 0;
}

# Resolve a register name through $regMap.  Returns the mapped value when the
# name is present and maps to a plain (non-reference) value; otherwise returns
# the name unchanged.
sub getRegNum
{
    my ($regMap, $regName) = @_;

    return !exists($regMap->{$regName}) || ref($regMap->{$regName}) ? $regName : $regMap->{$regName};
}

# List the registers covered by the r0 operand of an instruction.
#   $vectors - hash ref: vector name => array ref of register names
#   $capData - capture data; reads r0, type and i31w4
# Returns an empty list when r0 is missing (or false) or is RZ; two registers
# for type ".64" or i31w4 "0x3"; four for type ".128" or i31w4 "0xf";
# otherwise just the r0 name.  Confesses when a named register is not a
# suitable vector.
sub getVecRegisters
{
    my ($vectors, $capData) = @_;
    my $regName = $capData->{r0} or return;

    return if $regName eq 'RZ';

    # either capture may be absent; compare against '' rather than undef
    my $type  = $capData->{type}  // '';
    my $i31w4 = $capData->{i31w4} // '';

    if ($type eq '.64' || $i31w4 eq '0x3')
    {
        if ($regName =~ m'^R(\d+)$')
        {
            return map "R$_", ($1 .. $1+1);
        }
        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
        return @{$vectors->{$regName}}[0,1];
    }
    if ($type eq '.128' || $i31w4 eq '0xf')
    {
        if ($regName =~ m'^R(\d+)$')
        {
            return map "R$_", ($1 .. $1+3);
        }
        confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
        return @{$vectors->{$regName}};
    }
    return $regName;
}

# List the registers covered by the r8 (address) operand.
#   $vectors - hash ref: vector name => array ref of register names
#   $capData - capture data; reads r8 and the existence of E
# Returns an empty list when r8 is missing (or false) or is RZ; a register pair
# when the E capture exists; otherwise just the r8 name.  For an unknown named
# register the vector table is dumped to STDOUT before confessing.
sub getAddrVecRegisters
{
    my ($vectors, $capData) = @_;
    my $regName = $capData->{r8} or return;

    return if $regName eq 'RZ';

    if (exists $capData->{E})
    {
        if ($regName =~ m'^R(\d+)$')
        {
            return map "R$_", ($1 .. $1+1);
        }
        print Dumper($vectors) unless exists $vectors->{$regName};
        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
        return @{$vectors->{$regName}}[0,1];
    }
    return $regName;
}

__END__

diff --git a/Assembler/PascalAs/blib/lib/auto/MaxAs/MaxAs/.exists b/Assembler/PascalAs/blib/lib/auto/MaxAs/MaxAs/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/PascalAs/blib/lib/auto/PascalAs/PascalAs/.exists b/Assembler/PascalAs/blib/lib/auto/PascalAs/PascalAs/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/PascalAs/blib/man1/.exists b/Assembler/PascalAs/blib/man1/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/PascalAs/blib/man3/.exists b/Assembler/PascalAs/blib/man3/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/PascalAs/blib/man3/MaxAs::MaxAs.3pm b/Assembler/PascalAs/blib/man3/MaxAs::MaxAs.3pm new file mode 100644 index 0000000..9f95fff --- /dev/null +++ b/Assembler/PascalAs/blib/man3/MaxAs::MaxAs.3pm @@ -0,0 +1,170 @@ +.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.13) +.\" +.\" Standard preamble: +.\" ======================================================================== +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" Set up some character translations and predefined strings. \*(-- will +.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left +.\" double quote, and \*(R" will give a right double quote. \*(C+ will +.\" give a nicer C++. Capital omega is used to do unbreakable dashes and +.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, +.\" nothing in troff, for use with C<>. +.tr \(*W- +.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' +.ie n \{\ +. ds -- \(*W- +. ds PI pi +.
if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch +. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch +. ds L" "" +. ds R" "" +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds -- \|\(em\| +. ds PI \(*p +. ds L" `` +. ds R" '' +'br\} +.\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" +.\" If the F register is turned on, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.ie \nF \{\ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. nr % 0 +. rr F +.\} +.el \{\ +. de IX +.. +.\} +.\" +.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). +.\" Fear. Run. Save yourself. No user-serviceable parts. +. \" fudge factors for nroff and troff +.if n \{\ +. ds #H 0 +. ds #V .8m +. ds #F .3m +. ds #[ \f1 +. ds #] \fP +.\} +.if t \{\ +. ds #H ((1u-(\\\\n(.fu%2u))*.13m) +. ds #V .6m +. ds #F 0 +. ds #[ \& +. ds #] \& +.\} +. \" simple accents for nroff and troff +.if n \{\ +. ds ' \& +. ds ` \& +. ds ^ \& +. ds , \& +. ds ~ ~ +. ds / +.\} +.if t \{\ +. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" +. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' +. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' +. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' +. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' +. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' +.\} +. 
\" troff and (daisy-wheel) nroff accents +.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' +.ds 8 \h'\*(#H'\(*b\h'-\*(#H' +.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] +.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' +.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' +.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] +.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] +.ds ae a\h'-(\w'a'u*4/10)'e +.ds Ae A\h'-(\w'A'u*4/10)'E +. \" corrections for vroff +.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' +.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' +. \" for low resolution devices (crt and lpr) +.if \n(.H>23 .if \n(.V>19 \ +\{\ +. ds : e +. ds 8 ss +. ds o a +. ds d- d\h'-1'\(ga +. ds D- D\h'-1'\(hy +. ds th \o'bp' +. ds Th \o'LP' +. ds ae ae +. ds Ae AE +.\} +.rm #[ #] #H #V #F C +.\" ======================================================================== +.\" +.IX Title "MaxAs::MaxAs 3" +.TH MaxAs::MaxAs 3 "2016-02-04" "perl v5.10.1" "User Contributed Perl Documentation" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. 
+.if n .ad l +.nh +.SH "NAME" +MaxAs::MaxAs \- Assembler for NVIDIA Maxwell architecture +.SH "SYNOPSIS" +.IX Header "SYNOPSIS" +.Vb 1 +\& maxas.pl [opts] +.Ve +.SH "DESCRIPTION" +.IX Header "DESCRIPTION" +See the documentation at: https://github.com/NervanaSystems/maxas +.SH "SEE ALSO" +.IX Header "SEE ALSO" +See the documentation at: https://github.com/NervanaSystems/maxas +.SH "AUTHOR" +.IX Header "AUTHOR" +Scott Gray, +.SH "COPYRIGHT AND LICENSE" +.IX Header "COPYRIGHT AND LICENSE" +The \s-1MIT\s0 License (\s-1MIT\s0) +.PP +Copyright (c) 2014 Scott Gray +.PP +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the \*(L"Software\*(R"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +.PP +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +.PP +\&\s-1THE\s0 \s-1SOFTWARE\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R", \s-1WITHOUT\s0 \s-1WARRANTY\s0 \s-1OF\s0 \s-1ANY\s0 \s-1KIND\s0, \s-1EXPRESS\s0 \s-1OR\s0 +\&\s-1IMPLIED\s0, \s-1INCLUDING\s0 \s-1BUT\s0 \s-1NOT\s0 \s-1LIMITED\s0 \s-1TO\s0 \s-1THE\s0 \s-1WARRANTIES\s0 \s-1OF\s0 \s-1MERCHANTABILITY\s0, +\&\s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0 \s-1AND\s0 \s-1NONINFRINGEMENT\s0. 
\s-1IN\s0 \s-1NO\s0 \s-1EVENT\s0 \s-1SHALL\s0 \s-1THE\s0 +\&\s-1AUTHORS\s0 \s-1OR\s0 \s-1COPYRIGHT\s0 \s-1HOLDERS\s0 \s-1BE\s0 \s-1LIABLE\s0 \s-1FOR\s0 \s-1ANY\s0 \s-1CLAIM\s0, \s-1DAMAGES\s0 \s-1OR\s0 \s-1OTHER\s0 +\&\s-1LIABILITY\s0, \s-1WHETHER\s0 \s-1IN\s0 \s-1AN\s0 \s-1ACTION\s0 \s-1OF\s0 \s-1CONTRACT\s0, \s-1TORT\s0 \s-1OR\s0 \s-1OTHERWISE\s0, \s-1ARISING\s0 \s-1FROM\s0, +\&\s-1OUT\s0 \s-1OF\s0 \s-1OR\s0 \s-1IN\s0 \s-1CONNECTION\s0 \s-1WITH\s0 \s-1THE\s0 \s-1SOFTWARE\s0 \s-1OR\s0 \s-1THE\s0 \s-1USE\s0 \s-1OR\s0 \s-1OTHER\s0 \s-1DEALINGS\s0 \s-1IN\s0 +\&\s-1THE\s0 \s-1SOFTWARE\s0. diff --git a/Assembler/PascalAs/blib/man3/PascalAs::PascalAs.3pm b/Assembler/PascalAs/blib/man3/PascalAs::PascalAs.3pm new file mode 100644 index 0000000..22de6a2 --- /dev/null +++ b/Assembler/PascalAs/blib/man3/PascalAs::PascalAs.3pm @@ -0,0 +1,117 @@ +.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29) +.\" +.\" Standard preamble: +.\" ======================================================================== +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" Set up some character translations and predefined strings. \*(-- will +.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left +.\" double quote, and \*(R" will give a right double quote. \*(C+ will +.\" give a nicer C++. Capital omega is used to do unbreakable dashes and +.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, +.\" nothing in troff, for use with C<>. +.tr \(*W- +.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' +.ie n \{\ +. ds -- \(*W- +. ds PI pi +. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch +. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch +. ds L" "" +. ds R" "" +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds -- \|\(em\| +. ds PI \(*p +. ds L" `` +. ds R" '' +. ds C` +. 
ds C' +'br\} +.\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" +.\" If the F register is turned on, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.\" +.\" Avoid warning from groff about undefined register 'F'. +.de IX +.. +.nr rF 0 +.if \n(.g .if rF .nr rF 1 +.if (\n(rF:(\n(.g==0)) \{ +. if \nF \{ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. if !\nF==2 \{ +. nr % 0 +. nr F 2 +. \} +. \} +.\} +.rr rF +.\" ======================================================================== +.\" +.IX Title "PascalAs::PascalAs 3pm" +.TH PascalAs::PascalAs 3pm "2018-11-05" "perl v5.22.1" "User Contributed Perl Documentation" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. 
+.if n .ad l +.nh +.SH "NAME" +PascalAs::PascalAs \- Assembler for NVIDIA Pascal architecture +.SH "SYNOPSIS" +.IX Header "SYNOPSIS" +.Vb 1 +\& pascalas.pl [opts] +.Ve +.SH "DESCRIPTION" +.IX Header "DESCRIPTION" +See the documentation at: https://github.com/NervanaSystems/pascalas +.SH "SEE ALSO" +.IX Header "SEE ALSO" +See the documentation at: https://github.com/NervanaSystems/pascalas +.SH "AUTHOR" +.IX Header "AUTHOR" +Scott Gray, +.SH "COPYRIGHT AND LICENSE" +.IX Header "COPYRIGHT AND LICENSE" +The \s-1MIT\s0 License (\s-1MIT\s0) +.PP +Copyright (c) 2014 Scott Gray +.PP +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the \*(L"Software\*(R"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +.PP +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +.PP +\&\s-1THE SOFTWARE IS PROVIDED \*(L"AS IS\*(R", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.\s0 diff --git a/Assembler/PascalAs/blib/script/.exists b/Assembler/PascalAs/blib/script/.exists new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/PascalAs/blib/script/maxas.pl b/Assembler/PascalAs/blib/script/maxas.pl new file mode 100755 index 0000000..91cfa30 --- /dev/null +++ b/Assembler/PascalAs/blib/script/maxas.pl @@ -0,0 +1,289 @@ +#!/usr/bin/perl + +eval 'exec /usr/bin/perl -S $0 ${1+"$@"}' + if 0; # not running under some shell +use strict; +use MaxAs::Cubin; +use MaxAs::MaxAs; +use Data::Dumper; +use File::Spec; + +require 5.10.0; + +$Data::Dumper::Sortkeys = 1; + +my $mode = shift; + +# List cubin contents +if ($mode =~ /^\-?\-l/i) +{ + my $cubinFile = shift or usage(); + + my $cubin = MaxAs::Cubin->new($cubinFile); + + my $arch = $cubin->arch; + my $class = $cubin->class; + my $asize = $cubin->address_size; + my $kernels = $cubin->listKernels; + my $symbols = $cubin->listSymbols; + + printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize; + + foreach my $ker (sort keys %$kernels) + { + printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)}; + } + foreach my $sym (sort keys %$symbols) + { + printf "Symbol: %s\n", $sym; + } +} +# Test that the assembler can reproduce the op codes this cubin or sass contains +elsif ($mode =~ /^\-?\-t/i) +{ + my $reg = shift if $ARGV[0] =~ /^\-?\-r/i; + my $all = shift if $ARGV[0] =~ /^\-?\-a/i; + my $file = shift or usage(); + my $fh; + # sass file + if (-T $file) + { + open $fh, $file or die "$file: $!"; + } + # cubin file + else + { + my $cubin = MaxAs::Cubin->new($file); + my $arch = 
$cubin->arch; + + open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!"; + my $first = <$fh>; # first output line is checked for cuobjdump's fatal banner + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + } + exit(MaxAs::MaxAs::Test($fh, $reg, $all) ? 1 : 0); +} +# Extract an asm file containing the desired kernel +elsif ($mode =~ /^\-?\-e/i) +{ + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $cubinFile = shift or usage(); + my $asmFile = shift; + my $cubin = MaxAs::Cubin->new($cubinFile); + my $arch = $cubin->arch; + my $kernels = $cubin->listKernels; + + #default the kernel name if not specified. + $kernelName ||= (sort keys %$kernels)[0]; + + my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName"; + + open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile: $!"; # message echoes the command actually run (arch read from the cubin) + my $first = <$in>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + my $out; # destination handle: asm_file when given, otherwise STDOUT + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n"; + + print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt)); + + print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n"; + + print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params}; + + print $out "#\n# Instructions:\n\n"; + + MaxAs::MaxAs::Extract($in, $out, $kernel->{Params}); + + close $out if $asmFile; + close $in; +} +# Extract a kernel from a sass dump +elsif ($mode =~ /^\-?\-s/i) +{ + my $sassFile = shift or usage(); + my $asmFile = shift; + + open my $in, $sassFile or die "$sassFile: $!"; + + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + MaxAs::MaxAs::Extract($in, $out, []); # no cubin here, so no parameter list is passed + + close $out if $asmFile; + close $in; +} +# Insert the kernel 
asm back into the cubin: +elsif ($mode =~ /^\-?\-i/i) +{ + my $nowarn; + if ($ARGV[0] =~ /^\-?\-w/i) + { + $nowarn = shift; + } + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $noReuse = shift if $ARGV[0] =~ /^\-?\-n/i; + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';" # NOTE(review): $value is spliced unescaped into eval'd source; a quote in it breaks or injects code, and $@ is never checked + } + + my $asmFile = shift or usage(); + my $cubinFile = shift or usage(); + my $newCubin = shift || $cubinFile; # default: overwrite the input cubin + + my $file; + if (open my $fh, $asmFile) + { + local $/; + $file = <$fh>; + close $fh; + } + else { die "$asmFile: $!" } + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + # extract the kernel name from the file + ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName; + die "asm file missing kernel name or is badly formatted" unless $kernelName; + + my $kernel = MaxAs::MaxAs::Assemble($file, $include, !$noReuse, $nowarn); + + my $cubin = MaxAs::Cubin->new($cubinFile); + $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName"; + + $cubin->modifyKernel(%$kernel); + + $cubin->write($newCubin); + + printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n", # literal percent written as %%; kernel name passed as data, not as part of the format + $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)}; + +} +# Preprocessing: +elsif ($mode =~ /^\-?\-p/i) +{ + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';"; # NOTE(review): same unescaped string-eval as in insert mode + } + my $debug = shift if $ARGV[0] =~ /^\-?\-d/i; + my $asmFile = shift or usage(); + my $asmFile2 = shift; + + die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2; + + open my $fh, $asmFile or die "$asmFile: $!"; + local $/; + my $file = <$fh>; + close $fh; + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = 
[$vol, $dir]; + + if ($asmFile2) + { + open $fh, ">$asmFile2" or die "$asmFile2: $!"; + } + else + { + $fh = \*STDOUT; + } + print $fh MaxAs::MaxAs::Preprocess($file, $include, $debug); + close $fh; +} +# get version information +elsif ($mode =~ /^\-?\-v/i) +{ + print "$MaxAs::MaxAs::VERSION\n"; +} +else +{ + print "$mode\n"; + usage(); +} + +exit(0); + + + +sub usage +{ + print < + + Test a cubin or sass file to to see if the assembler can reproduce all of the contained opcodes. + Also useful for extending the missing grammar rules. Defaults to only showing failures without --all. + With the --reg flag it will show register bank conflicts not hidden by reuse flags. + + maxas.pl --test|-t [--reg|-r] [--all|-a] + + Extract a single kernel into an asm file from a cubin. + Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin. + + maxas.pl --extract|-e [--kernel|-k kernel_name] [asm_file] + + Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes. + Include the debug flag to print out detailed scheduler info. + + maxas.pl --pre|-p [--debug|-d] [new_asm_file] + + Insert the kernel asm back into the cubin. Overwrite existing or create new cubin. + Optionally you can skip register reuse flag auto insertion. This allows you to observe + performance without any reuse or you can use it to set the flags manually in your sass. 
+ + maxas.pl --insert|-i [--noreuse|-n] [new_cubin_file] + + Display version information and exit: + + maxas.pl --version|-v + +EOF + exit(1); +} + +__END__ diff --git a/Assembler/PascalAs/blib/script/pascalas.pl b/Assembler/PascalAs/blib/script/pascalas.pl new file mode 100755 index 0000000..a0f1372 --- /dev/null +++ b/Assembler/PascalAs/blib/script/pascalas.pl @@ -0,0 +1,286 @@ +#!/usr/bin/perl +use strict; +use PascalAs::Cubin; +use PascalAs::PascalAs; +use Data::Dumper; +use File::Spec; + +require 5.10.0; + +$Data::Dumper::Sortkeys = 1; + +my $mode = shift; + +# List cubin contents +if ($mode =~ /^\-?\-l/i) +{ + my $cubinFile = shift or usage(); + + my $cubin = PascalAs::Cubin->new($cubinFile); + + my $arch = $cubin->arch; + my $class = $cubin->class; + my $asize = $cubin->address_size; + my $kernels = $cubin->listKernels; + my $symbols = $cubin->listSymbols; + + printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize; + + foreach my $ker (sort keys %$kernels) + { + printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)}; + } + foreach my $sym (sort keys %$symbols) + { + printf "Symbol: %s\n", $sym; + } +} +# Test that the assembler can reproduce the op codes this cubin or sass contains +elsif ($mode =~ /^\-?\-t/i) +{ + my $reg = shift if $ARGV[0] =~ /^\-?\-r/i; + my $all = shift if $ARGV[0] =~ /^\-?\-a/i; + my $file = shift or usage(); + my $fh; + # sass file + if (-T $file) + { + open $fh, $file or die "$file: $!"; + } + # cubin file + else + { + my $cubin = PascalAs::Cubin->new($file); + my $arch = $cubin->arch; + + open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!"; + my $first = <$fh>; + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + } + exit(PascalAs::PascalAs::Test($fh, $reg, $all) ? 
1 : 0); +} +# Extract an asm file containing the desired kernel +elsif ($mode =~ /^\-?\-e/i) +{ + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $cubinFile = shift or usage(); + my $asmFile = shift; + my $cubin = PascalAs::Cubin->new($cubinFile); + my $arch = $cubin->arch; + my $kernels = $cubin->listKernels; + + #default the kernel name if not specified. + $kernelName ||= (sort keys %$kernels)[0]; + + my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName"; + + open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile: $!"; # message echoes the command actually run instead of a fixed arch + my $first = <$in>; # first output line is checked for cuobjdump's fatal banner + if ($first =~ /cuobjdump fatal/) + { + print $first; + exit(1); + } + my $out; # destination handle: asm_file when given, otherwise STDOUT + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n"; + + print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt)); + + print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n"; + + print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params}; + + print $out "#\n# Instructions:\n\n"; + + PascalAs::PascalAs::Extract($in, $out, $kernel->{Params}); + + close $out if $asmFile; + close $in; +} +# Extract a kernel from a sass dump +elsif ($mode =~ /^\-?\-s/i) +{ + my $sassFile = shift or usage(); + my $asmFile = shift; + + open my $in, $sassFile or die "$sassFile: $!"; + + my $out; + if ($asmFile) + { + open $out, ">$asmFile" or die "$asmFile: $!"; + } + else + { + $out = \*STDOUT; + } + + PascalAs::PascalAs::Extract($in, $out, []); # no cubin here, so no parameter list is passed + + close $out if $asmFile; + close $in; +} +# Insert the kernel asm back into the cubin: +elsif ($mode =~ /^\-?\-i/i) +{ + my $nowarn; + if ($ARGV[0] =~ /^\-?\-w/i) + { + $nowarn = shift; + } + my $kernelName; + if ($ARGV[0] =~ /^\-?\-k/i) + { + shift; + $kernelName = shift or usage(); + } + my $noReuse = 
shift if $ARGV[0] =~ /^\-?\-n/i; + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package PascalAs::PascalAs::CODE; our \$$name = '$value';" # NOTE(review): $value is spliced unescaped into eval'd source; a quote in it breaks or injects code, and $@ is never checked + } + + my $asmFile = shift or usage(); + my $cubinFile = shift or usage(); + my $newCubin = shift || $cubinFile; # default: overwrite the input cubin + + my $file; + if (open my $fh, $asmFile) + { + local $/; + $file = <$fh>; + close $fh; + } + else { die "$asmFile: $!" } + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + # extract the kernel name from the file + ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName; + die "asm file missing kernel name or is badly formatted" unless $kernelName; + + my $kernel = PascalAs::PascalAs::Assemble($file, $include, !$noReuse, $nowarn); + + my $cubin = PascalAs::Cubin->new($cubinFile); + $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName"; + + $cubin->modifyKernel(%$kernel); + + $cubin->write($newCubin); + + printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n", # literal percent written as %%; kernel name passed as data, not as part of the format + $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)}; + +} +# Preprocessing: +elsif ($mode =~ /^\-?\-p/i) +{ + while ($ARGV[0] =~ /^\-?\-D(\w+)/) + { + shift; + my $name = $1; + my $value = shift; + eval "package PascalAs::PascalAs::CODE; our \$$name = '$value';"; # NOTE(review): same unescaped string-eval as in insert mode + } + my $debug = shift if $ARGV[0] =~ /^\-?\-d/i; + my $asmFile = shift or usage(); + my $asmFile2 = shift; + + die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2; + + open my $fh, $asmFile or die "$asmFile: $!"; + local $/; + my $file = <$fh>; + close $fh; + + my ($vol,$dir) = File::Spec->splitpath($asmFile); + my $include = [$vol, $dir]; + + if ($asmFile2) + { + open $fh, ">$asmFile2" or die "$asmFile2: $!"; + } + else + { + $fh = \*STDOUT; + } + print $fh PascalAs::PascalAs::Preprocess($file, $include, $debug); + close $fh; +} +# get version 
information +elsif ($mode =~ /^\-?\-v/i) +{ + print "$PascalAs::PascalAs::VERSION\n"; +} +else +{ + print "$mode\n"; + usage(); +} + +exit(0); + + + +sub usage +{ + print < + + Test a cubin or sass file to to see if the assembler can reproduce all of the contained opcodes. + Also useful for extending the missing grammar rules. Defaults to only showing failures without --all. + With the --reg flag it will show register bank conflicts not hidden by reuse flags. + + pascalas.pl --test|-t [--reg|-r] [--all|-a] + + Extract a single kernel into an asm file from a cubin. + Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin. + + pascalas.pl --extract|-e [--kernel|-k kernel_name] [asm_file] + + Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes. + Include the debug flag to print out detailed scheduler info. + + pascalas.pl --pre|-p [--debug|-d] [new_asm_file] + + Insert the kernel asm back into the cubin. Overwrite existing or create new cubin. + Optionally you can skip register reuse flag auto insertion. This allows you to observe + performance without any reuse or you can use it to set the flags manually in your sass. 
+ + pascalas.pl --insert|-i [--noreuse|-n] [new_cubin_file] + + Display version information and exit: + + pascalas.pl --version|-v + +EOF + exit(1); +} + +__END__ diff --git a/Assembler/PascalAs/cpanfile b/Assembler/PascalAs/cpanfile new file mode 100644 index 0000000..e8281c5 --- /dev/null +++ b/Assembler/PascalAs/cpanfile @@ -0,0 +1,4 @@ +requires 'perl', '5.10.0'; + +requires 'Carp', '1.29'; +requires 'Data::Dumper', '2.145'; diff --git a/Assembler/PascalAs/lib/PascalAs/Cubin.pm b/Assembler/PascalAs/lib/PascalAs/Cubin.pm new file mode 100644 index 0000000..10bf9a8 --- /dev/null +++ b/Assembler/PascalAs/lib/PascalAs/Cubin.pm @@ -0,0 +1,686 @@ +package PascalAs::Cubin; + +use strict; +use Data::Dumper; + +my @Elf32_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + L entry + L phOffset + L shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf64_Hdr = qw( + H8 magic + C fileClass + C encoding + C fileVersion + H18 padding + S type + S machine + L version + Q entry + Q phOffset + Q shOffset + L flags + S ehSize + S phEntSize + S phNum + S shEntSize + S shNum + S shStrIndx +); +my @Elf32_PrgHdr = qw( + L type + L offset + L vaddr + L paddr + L fileSize + L memSize + L flags + L align +); +my @Elf64_PrgHdr = qw( + L type + L flags + Q offset + Q vaddr + Q paddr + Q fileSize + Q memSize + Q align +); +my @Elf32_SecHdr = qw( + L name + L type + L flags + L addr + L offset + L size + L link + L info + L align + L entSize +); +my @Elf64_SecHdr = qw( + L name + L type + Q flags + Q addr + Q offset + Q size + L link + L info + Q align + Q entSize +); +my @Elf32_SymEnt = qw( + L name + L value + L size + C info + C other + S shIndx +); +my @Elf64_SymEnt = qw( + L name + C info + C other + S shIndx + Q value + Q size +); +my @symBind = qw(LOCAL GLOBAL WEAK); + +# Split the Elf Header defs into template strings (T) and corresponding hash keys columns (C) +my 
(@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC); + +$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr; +$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr; +$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr; +$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt; + +$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr; +$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr; +$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr; +$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt; + +$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr ]; +$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ]; +$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ]; +$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ]; + +$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr ]; +$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ]; +$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ]; +$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ]; + +# Load a cubin ELF file +sub new +{ + my ($package, $file) = @_; + + my $cubin = bless { fileName => $file }, $package; + + open my $fh, $file or die "$file: $!"; + binmode($fh); + + # Read in assuming 32 bit header + my $data; + read $fh, $data, 0x34; + my $elfHdr = $cubin->{elfHdr} = {}; + @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data; + + # 1: 32bit, 2: 64bit + my $class = $elfHdr->{fileClass}; + + # re-read in with 64 bit header if needed + if ($class == 2) + { + seek $fh, 0, 0; + read $fh, $data, 0x46; + @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data; + + $cubin->{Class} = 64; + } + else + { + $cubin->{Class} = 32; + } + + # verify sm_60 cubin + #$cubin->{Arch} = $elfHdr->{flags} & 0xFF; + #die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50; + + $cubin->{Arch} = "60"; + die "Cubin not in sm_60. 
Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} != 60; + $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32; + + # Read in Program Headers + seek $fh, $elfHdr->{phOffset}, 0; + foreach (1 .. $elfHdr->{phNum}) + { + read $fh, $data, $elfHdr->{phEntSize}; + + my %prgHdr = (Indx => $_ - 1); + @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data; + push @{$cubin->{prgHdrs}}, \%prgHdr; + } + + # Read in Section Headers + seek $fh, $elfHdr->{shOffset}, 0; + foreach (1 .. $elfHdr->{shNum}) + { + read $fh, $data, $elfHdr->{shEntSize}; + + my %secHdr = (Indx => $_ - 1); + @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data; + push @{$cubin->{secHdrs}}, \%secHdr; + } + + # Read in Section data + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $data = ''; + # Skip sections with no data (type NULL or NOBITS) + if ($secHdr->{size} && $secHdr->{type} != 8) + { + seek $fh, $secHdr->{offset}, 0; + read $fh, $data, $secHdr->{size}; + } + # Convert string tables to maps + if ($secHdr->{type} == 3) # STRTAB + { + my $strTab = $secHdr->{StrTab} = {}; + my $indx = 0; + foreach my $str (split "\0", $data) + { + $strTab->{$indx} = $str; + $indx += 1 + length($str); + } + } + # Read in Symbol data + if ($secHdr->{type} == 2) # SYMTAB + { + my $offset = 0; + while ($offset < $secHdr->{size}) + { + my $symEnt = {}; + @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize}); + $offset += $secHdr->{entSize}; + + push @{$secHdr->{SymTab}}, $symEnt; + } + } + # Cache raw data for further processing and writing + $secHdr->{Data} = unpack 'H*', $data; + } + close $fh; + + # Update section headers with their names. Map names directly to headers. 
+ my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab}; + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + $secHdr->{Name} = $shStrTab->{$secHdr->{name}}; + $cubin->{$secHdr->{Name}} = $secHdr; + } + + # Update symbols with their names + # For the Global functions, extract kernel meta data + # Populate the kernel hash + my $strTab = $cubin->{'.strtab'}{StrTab}; + foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}}) + { + $symEnt->{Name} = $strTab->{$symEnt->{name}}; + + # Attach symbol to section + my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}]; + $secHdr->{SymbolEnt} = $symEnt; + + # Look for symbols tagged FUNC + if (($symEnt->{info} & 0x0f) == 0x02) + { + # Create a hash of kernels for output + my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr; + + # Extract local/global/weak binding info + $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4]; + + # Extract the kernel instructions + $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ]; + + # Extract the max barrier resource identifier used and add 1. Should be 0-16. + # If a register is used as a barrier resource id, then this value is the max of 16. + $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20; + + # Extract the number of allocated registers for this kernel. + $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24; + + # Extract the size of shared memory this kernel uses. + my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"}; + $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0; + + # Attach constant0 section + $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"}; + + # Extract the kernel parameter data. 
+ my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"}; + if ($paramSec) + { + # Extract raw param data + my @data = unpack "L*", pack "H*", $paramSec->{Data}; + + $paramSec->{ParamData} = \@data; + $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ]; + + # Find the first param delimiter + my $idx = 0; + $idx++ while $idx < @data && $data[$idx] != 0x00080a04; + + my $first = $data[$idx+2] & 0xFFFF; + #my $size = $data[$idx+2] >> 16; + $idx += 4; + + my @params; + while ($idx < @data && $data[$idx] == 0x000c1704) + { + # Get the ordinal, offset, size and pointer alignment for each param + my $ord = $data[$idx+2] & 0xFFFF; + my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16); + my $psize = $data[$idx+3] >> 18; + my $align = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0; + unshift @params, "$ord:$offset:$psize:$align"; + $idx += 4; + } + my @staticParams = @data[0 .. ($idx-1)]; + + my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize); + while ($idx < @data) + { + my $code = $data[$idx] & 0xffff; + my $size = $data[$idx] >> 16; + $idx++; + + # EIATTR_MAXREG_COUNT + if ($code == 0x1b03) + { + $maxregCount = $size; + } + # EIATTR_S2RCTAID_INSTR_OFFSETS + elsif ($code == 0x1d04) + { + while ($size > 0) + { + push @ctaidOffsets, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_EXIT_INSTR_OFFSETS + elsif ($code == 0x1c04) + { + while ($size > 0) + { + push @exitOffsets, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_CTAIDZ_USED + elsif ($code == 0x0401) + { + $ctaidzUsed = 1; + } + # EIATTR_REQNTID + elsif ($code == 0x1004) + { + while ($size > 0) + { + push @reqntid, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_MAX_THREADS + elsif ($code == 0x0504) + { + while ($size > 0) + { + push @maxntid, $data[$idx++]; + $size -= 4; + } + } + # EIATTR_CRS_STACK_SIZE + elsif ($code == 0x1e04) + { + while ($size > 0) + { + push @stackSize, $data[$idx++]; + $size -= 4; + } + } + else + { + 
printf "Unknown Code 0x%02x (size:%d)\n", $code, $size; + } + } + $kernelSec->{Params} = \@params; + $kernelSec->{ParamCnt} = scalar @params; + + $paramSec->{StaticParams} = \@staticParams; + $paramSec->{MAXREG_COUNT} = $maxregCount; + $paramSec->{ExitOffsets} = \@exitOffsets; + $paramSec->{CTAIDOffsets} = \@ctaidOffsets; + $paramSec->{CTAIDZUsed} = $ctaidzUsed; + $paramSec->{REQNTID} = \@reqntid; + $paramSec->{MAXNTID} = \@maxntid; + $paramSec->{STACKSIZE} = \@stackSize; + } + # print Dumper($paramSec); + # exit(); + } + # Note GLOBALs found in this cubin + elsif (($symEnt->{info} & 0x10) == 0x10) + { + $cubin->{Symbols}{$symEnt->{Name}} = $symEnt; + } + } + + # print "phOffset: $elfHdr->{phOffset}\n"; + # print "shOffset: $elfHdr->{shOffset}\n"; + # foreach my $secHdr (@{$cubin->{secHdrs}}) + # { + # print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n"; + # } + # my $p = 0; + # foreach my $prgHdr (@{$cubin->{prgHdrs}}) + # { + # print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n"; + # $p++; + # } + # exit(); + + # print Dumper($cubin->{prgHdrs}); + # exit(); + return $cubin; +} +sub class +{ + return shift()->{Class}; +} +sub arch +{ + return shift()->{Arch}; +} +sub address_size +{ + return shift()->{AddressSize}; +} +sub listKernels +{ + return shift()->{Kernels}; +} +sub listSymbols +{ + return shift()->{Symbols}; +} +sub getKernel +{ + my ($cubin, $kernel) = @_; + return $cubin->{Kernels}{$kernel}; +} + +sub modifyKernel +{ + my ($cubin, %params) = @_; + + my $kernelSec = $params{Kernel}; + my $newReg = $params{RegCnt}; + my $newBar = $params{BarCnt}; + my $exitOffsets = $params{ExitOffsets}; + my $ctaidOffsets = $params{CTAIDOffsets}; + my $ctaidzUsed = $params{CTAIDZUsed}; + my $newData = $params{KernelData}; + my $newSize = @$newData * 8; + + die "255 register max" if $newReg > 255; + die "new kernel 
size must be multiple of 8 instructions (64 bytes)" if $newSize & 63; + die "16 is max barrier count" if $newBar > 16; + + my $paramSec = $kernelSec->{ParamSec}; + my $kernelName = $kernelSec->{SymbolEnt}{Name}; + my $maxregCount = $paramSec->{MAXREG_COUNT}; + my $stackSize = $paramSec->{STACKSIZE}; + + # update the kernel + $kernelSec->{KernelData} = $newData; + $kernelSec->{Data} = unpack "H*", pack "Q*", @$newData; + + if ($newReg != $kernelSec->{RegCnt}) + { + print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n"; + $kernelSec->{RegCnt} = $newReg; + $kernelSec->{info} &= ~0xff000000; + $kernelSec->{info} |= $newReg << 24; + } + if ($newBar != $kernelSec->{BarCnt}) + { + print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n"; + $kernelSec->{BarCnt} = $newBar; + $kernelSec->{flags} &= ~0x01f00000; + $kernelSec->{flags} |= $newBar << 20; + } + + my @paramData = @{$paramSec->{StaticParams}}; + + if (defined $maxregCount) + { + push @paramData, ($maxregCount << 16) | 0x1b03; + } + + my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets; + my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}}; + + if ($newCTAIDs ne $oldCTAIDs) + { + print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"; + } + if (@$ctaidOffsets) + { + push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04; + push @paramData, @$ctaidOffsets; + } + + my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets; + my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}}; + + if ($newExits ne $oldExits) + { + print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"; + } + if (@$exitOffsets) + { + push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04; + push @paramData, @$exitOffsets; + } + + if ($ctaidzUsed != $paramSec->{CTAIDZUsed}) + { + print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"; + } + if ($ctaidzUsed) + { + push 
@paramData, 0x0401; + } + + if (@{$paramSec->{REQNTID}}) + { + push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004; + push @paramData, @{$paramSec->{REQNTID}}; + } + if (@{$paramSec->{MAXNTID}}) + { + push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504; + push @paramData, @{$paramSec->{MAXNTID}}; + } + + if (@$stackSize) + { + push @paramData, (scalar(@$stackSize) << 18) | 0x1e04; + push @paramData, @$stackSize; + } + + my $newParamSize = scalar(@paramData)*4; + $paramSec->{Data} = unpack "H*", pack "L*", @paramData; + if ($newParamSize != $paramSec->{size}) + { + print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n"; + $cubin->updateSize($paramSec, $newParamSize); + } + + if ($newSize != $kernelSec->{size}) + { + print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n"; + $cubin->updateSize($kernelSec, $newSize, 1); + } +} + +sub updateSize +{ + my ($cubin, $sec, $newSize, $updatePrgSize) = @_; + + my $elfHdr = $cubin->{elfHdr}; + my $class = $elfHdr->{fileClass}; + + # update section header + my $delta = $newSize - $sec->{size}; + $sec->{size} = $newSize; + + # update symtab section + if ($sec->{SymbolEnt}) + { + $sec->{SymbolEnt}{size} = $newSize; + my $symSection = $cubin->{'.symtab'}; + $symSection->{Data} = ''; + foreach my $symEnt (@{$symSection->{SymTab}}) + { + $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}}; + } + } + + my $pos = $elfHdr->{ehSize}; + my %sizeMap; + + # update section header offsets + foreach my $secHdr (@{$cubin->{secHdrs}}) + { + # skip first header + next if $secHdr->{align} == 0; + + # NOBITS data sections are size 0 + my $size = $secHdr->{type} == 8 ? 
0 : $secHdr->{size}; + + # Add any needed padding between sections + my $pad = $pos % $secHdr->{align}; + if ($pad > 0) + { + $pos += $secHdr->{align} - $pad; + } + # map old offset to new + $sizeMap{$secHdr->{offset}} = $pos; + + # update offset + $secHdr->{offset} = $pos; + + # advance position by size + $pos += $size; + } + + # compute total section header size + my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset}; + + # map old offset to new + $sizeMap{$elfHdr->{shOffset}} = $pos; + $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize; + + $elfHdr->{shOffset} = $pos; + $elfHdr->{phOffset} = $pos + $shSize; + + # update program header offsets and sizes + foreach my $prgHdr (@{$cubin->{prgHdrs}}) + { + # Not sure how best to adjust these so just assume they'll track other offsets. + $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}}; + + # If the kernel sizes changes, also update the associated ProgramHeader. + # Note that this size is the kernel size plus any constant section sizes. + if ($updatePrgSize && $prgHdr->{type} == 1 && + $sec->{offset} >= $prgHdr->{offset} && + $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta) + { + $prgHdr->{fileSize} += $delta; + $prgHdr->{memSize} += $delta; + } + } +} + +# Write out the cubin after modifying it. 
# Write the (possibly modified) cubin out to disk as an ELF image.
#
# Parameters:
#   $cubin - the cubin object; this sub reads its elfHdr, secHdrs and prgHdrs entries.
#   $file  - path of the output file; it is created or truncated.
#
# Written in order: the ELF header, the data of every section that has file
# contents (padded up to each section's alignment), every section header, then
# every program header.
#
# The pack templates (@elfHdrT, @secHdrT, @prgHdrT) and the matching field-name
# lists (@elfHdrC, @secHdrC, @prgHdrC) are indexed by the ELF file class and are
# declared elsewhere in this module.
#
# Returns 1 on success. Dies if the file cannot be opened, or if closing it
# reports an error.
sub write
{
    my ($cubin, $file) = @_;

    # Three-argument open: characters in $file can never change the open mode.
    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
    binmode($fh);

    my $elfHdr = $cubin->{elfHdr};
    my $class  = $elfHdr->{fileClass};

    # write elf header
    print {$fh} pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};

    # running byte offset in the output; only used to work out alignment padding
    my $pos = $elfHdr->{ehSize};

    # write section data
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        # Skip NULL and NOBITS data sections
        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;

        # Add any needed padding between sections.
        # An alignment of 0 or 1 places no constraint on the offset; testing for
        # it first also avoids an "Illegal modulus zero" error when align is 0.
        my $align = $secHdr->{align};
        if ($align > 1 && $pos % $align)
        {
            my $pad = $align - $pos % $align;
            print {$fh} "\0" x $pad;
            $pos += $pad;
        }

        # section contents are held as a hex string
        print {$fh} pack 'H*', $secHdr->{Data};
        $pos += $secHdr->{size};
    }

    # write section headers
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        print {$fh} pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
    }

    # write program headers
    foreach my $prgHdr (@{$cubin->{prgHdrs}})
    {
        print {$fh} pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
    }

    # Buffered write errors are only reported when the handle is closed.
    close $fh or die "Error: could not finish writing $file: $!";

    return 1;
}

__END__

diff --git a/Assembler/PascalAs/lib/PascalAs/PascalAs.pm b/Assembler/PascalAs/lib/PascalAs/PascalAs.pm new file mode 100644 index 0000000..eefcdf6 --- /dev/null +++ b/Assembler/PascalAs/lib/PascalAs/PascalAs.pm @@ -0,0 +1,1407 @@
package PascalAs::PascalAs;

require 5.10.0;

use strict;
use Data::Dumper;
use PascalAs::PascalAsGrammar;
use File::Spec;
use Carp;

our $VERSION = '1.06';

# these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
my %relOffset = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT);

# these ops use absolute addresses
my %absOffset = map { $_ => 1 } qw(JCAL);

my %jumpOp = (%relOffset, %absOffset);

# These instructions use r0 but do not write to r0
my %noDest = map { $_ => 1 } qw(ST STG STS STL RED);

# Map register slots to reuse control codes
my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4);

# Preprocess and
Assemble a source file +sub Assemble +{ + my ($file, $include, $doReuse, $nowarn) = @_; + + my $regMap = {}; + $file = Preprocess($file, $include, 0, $regMap); + my $vectors = delete $regMap->{__vectors}; + my $regBank = delete $regMap->{__regbank}; + + # initialize cubin counts + my $regCnt = 0; + my $barCnt = 0; + + my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse); + + # initialize the first control instruction + push @instructs, $ctrl = {}; + + foreach my $line (split "\n", $file) + { + # keep track of line nums in the physical file + $lineNum++; + + next unless preProcessLine($line); + + # match an instruction + if (my $inst = processAsmLine($line, $lineNum)) + { + # Save us from crashing the display driver + die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n" + if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0; + + # track branches/jumps/calls/etc for label remapping + push @branches, @instructs+0 if exists $jumpOp{$inst->{op}}; + + # push the control code onto the control instruction + push @{$ctrl->{ctrl}}, $inst->{ctrl}; + + # now point the instruction to its associated control instruction + $inst->{ctrl} = $ctrl; + + # add the op name and full instruction text + push @instructs, $inst; + + # add a 4th control instruction for every 3 instructions + push @instructs, $ctrl = {} if ((@instructs & 3) == 0); + } + # match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + # map the label name to the index of the instruction about to be inserted + $labels{$1} = @instructs+0; + } + else + { + die "badly formed line at $lineNum: $line\n"; + } + } + # add the final BRA op and align the number of instructions to a multiple of 8 + push @{$ctrl->{ctrl}}, 0x007ff; + push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' }; + while (@instructs & 7) + { + push @instructs, $ctrl = {} if ((@instructs & 3) == 0); + push @{$ctrl->{ctrl}}, 0x007e0; + push 
@instructs, { op => 'NOP', inst => 'NOP;' }; + } + + # remap labels + foreach my $i (@branches) + { + if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1}) + { die "instruction has invalid label: $instructs[$i]{inst}"; } + + $instructs[$i]{jump} = $labels{$1}; + + if (exists $relOffset{$instructs[$i]{op}}) + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; } + else + { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; } + } + + # calculate optimal register reuse + # This effects register bank decisions so do it before analyzing register use + foreach my $i (0 .. $#instructs) + { + #skip control instructions + next unless $i & 3; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + # Apply the rule pattern + my $capData = parseInstruct($inst, $gram) or next; + + if ($doReuse) + { + # get any vector registers for r0 + my @r0 = getVecRegisters($vectors, $capData); + + # There are 2 reuse slots per register slot + # The reuse hash points to most recent instruction index where register was last used in this slot + + # For writes to a register, clear any reuse opportunity + if (@r0 && !exists $noDest{$op}) + { + foreach my $slot (keys %reuseSlots) + { + if (my $reuse = $reuse{$slot}) + { + # if writing with a vector op, clear all linked registers + delete $reuse->{$_} foreach @r0; + } + } + } + # clear cache if jumping elsewhere + %reuse = () if exists $jumpOp{$op}; + + # only track register reuse for instruction types this works with + if ($gram->{type}{reuse}) + { + foreach my $slot (keys %reuseSlots) + { + next unless exists $capData->{$slot}; + + my $r = $capData->{$slot}; + next if $r eq 'RZ'; + next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction + + my $reuse = $reuse{$slot} ||= {}; + + # if this register was previously marked for potential reuse + if (my $p 
= $reuse->{$r}) + { + # flag the previous instruction's ctrl reuse array slot + $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot}; + + #print "reuse $slot $r $instructs[$p]{inst}\n"; + } + # list full, delete the oldest + elsif (keys %$reuse > 2) + { + my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0]; + delete $reuse->{$oldest}; + } + # mark the new instruction for potential reuse + $reuse->{$r} = $i; + } + } + } + # if reuse is disabled then pull value from code. + elsif ($gram->{type}{reuse}) + { + $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData); + } + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + # Assign registers to requested banks if possible + foreach my $r (sort keys %$regBank) + { + my $bank = $regBank->{$r}; + my $avail = $regMap->{$r}; + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 3)) + { + # assign it, while removing the assigned register from the pool + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + last; + } + } + } + + # calculate register live times and preferred banks for non-fixed registers. + # LiveTime only half implemented... + my (%liveTime, %pairedBanks, %reuseHistory); + foreach my $i (0 .. $#instructs) + { + #skip control instructions + next unless $i & 3; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + # Apply the rule pattern + my $capData = parseInstruct($inst, $gram) or next; + my $reuseType = $gram->{type}{reuse}; + + # liveTimes and bank conflicts with source operands + my (%addReuse, %delReuse); + foreach my $slot (qw(r8 r20 r39)) + { + my $r = $capData->{$slot} or next; + next if $r eq 'RZ'; + + my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r}; + + # All registers should be written prior to being read.. 
+ if (my $liveTime = $liveTime{$liveR}) + { + # for each read set the current instruction index as the high value + $liveTime->[$#$liveTime][1] = $i; + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + + # Is this register active in the reuse cache? + my $slotHist = $reuseHistory{$slot} ||= {}; + my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0; + + #print "IADD3-1: $slot:$r (!$selfReuse && $regMap->{$r})\n" if $op eq 'IADD3'; + + # If this is an auto reg, look at the open banks. + # No need to look at banks if this register is in the reuse cache. + if (!$selfReuse && ref $regMap->{$r}) + { + # Look at other source operands in this instruction and flag what banks are being used + foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39)) + { + my $r2 = $capData->{$slot2}; + next if $r2 eq 'RZ' || $r2 eq $r; + + my $slotHist2 = $reuseHistory{$slot2} ||= {}; + + #print "IADD3-2: $slot:$r $slot2:$r2 (!$reuseType && !$slotHist2->{$r2})\n" if $op eq 'IADD3'; + + # Dont be concerned with non-reuse type instructions or + # If this operand is in the reuse cache, we don't care what bank it's on. + if (!$reuseType || !exists $slotHist2->{$r2}) + { + # if the operand is also an auto-allocated register then link them + # Once we choose the bank for one we want to update that choice for the other register. + if (ref $regMap->{$r2}) + { + push @{$pairedBanks{$r}{pairs}}, $r2; + $pairedBanks{$r}{banks} ||= []; + } + # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid. + else + { + my $bank = substr($regMap->{$r2},1) & 3; + #print "IADD3-3: $r2:$bank\n" if $op eq 'IADD3'; + + $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++; + $pairedBanks{$r}{pairs} ||= []; + } + # Update the total use count for this register. 
+ # This will be the number of times the register is pulled out of the bank. + $pairedBanks{$r}{useCnt}++; + } + } + } + # update the reuse history so we know which bank conflicts we can ignore. + if ($reuseType) + { + # flag these slots for addition or removal from reuseHistory + if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot}) + { $addReuse{$slot} = $r; } + else + { $delReuse{$slot} = $r; } + } + } + # update reuse history after we're done with the instruction (when the flag is actually in effect). + # we don't want to updated it in the middle since that can interfere with the checks, + $reuseHistory{$_}{$addReuse{$_}} = 1 foreach keys %addReuse; + delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse; + + # liveTimes for destination operands and vector registers + foreach my $r0 (getVecRegisters($vectors, $capData)) + { + # fixed register mappings can have aliases so use the actual register value for those. + my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0}; + + # If not writing treat just like a read + if (exists $noDest{$op}) + { + if (my $liveTime = $liveTime{$liveR}) + { + $liveTime->[$#$liveTime][1] = $i; + push @{$liveTime->[$#$liveTime]}, "$i $inst"; + } + else + { + warn "register used without initialization ($r0): $inst\n" unless $nowarn; + push @{$liveTime{$liveR}}, [$i,$i]; + } + } + # If writing, push a new bracket on this register's stack. + elsif (my $liveTime = $liveTime{$liveR}) + { + if ($i > $liveTime->[$#$liveTime][1]) + { + push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + else + { + # Initialize the liveTime stack for this register. 
+ push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"]; + } + } + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + #print Dumper(\%liveTime); exit(1); + + # assign unassigned registers + # sort by most restricted, then most used, then name + foreach my $r (sort { + $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} || + $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} || + $a cmp $b + } keys %pairedBanks) + { + my $banks = $pairedBanks{$r}{banks}; + my $avail = $regMap->{$r}; + + #printf "%10s: (%d,%d) %d,%d,%d,%d, %s\n", $r, $pairedBanks{$r}{bnkCnt}, $pairedBanks{$r}{useCnt}, @{$banks}[0,1,2,3], join ',', @$avail; + + # Pick a bank with zero or the smallest number of conflicts + BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3)) + { + # pick an available register that matches the requested bank + foreach my $pos (0 .. $#$avail) + { + if ($bank == ($avail->[$pos] & 3)) + { + # assign it, while removing the assigned register from the pool + $regMap->{$r} = 'R' . splice @$avail, $pos, 1; + + # update bank info for any unassigned pair + $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}}; + last BANK; + } + } + } + } + # Now assign any remaining to first available + foreach my $r (sort keys %$regMap) + { + if (ref($regMap->{$r}) eq 'ARRAY') + { + $regMap->{$r} = 'R' . shift @{$regMap->{$r}}; + } + } + #print map "$regMap->{$_}: $_\n", sort { substr($regMap->{$a},1) <=> substr($regMap->{$b},1) } keys %$regMap; + + # apply the register mapping and assemble the instructions to op codes + foreach my $i (0 .. $#instructs) + { + #skip control instructions + next unless $i & 3; + + # save the original and replace the register names with numbers + $instructs[$i]{orig} = $instructs[$i]{inst}; + $instructs[$i]{inst} =~ s/(?{$1}) ? 
$regMap->{$1} : $1 /ge; + + my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)}; + + my $match = 0; + foreach my $gram (@{$grammar{$op}}) + { + # Apply the rule pattern + my $capData = parseInstruct($inst, $gram) or next; + + # update the register count + foreach my $r (qw(r0 r8 r20 r39)) + { + next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ'; + + # get numeric portion of regname + my $val = substr $capData->{$r}, 1; + + my @r0 = getVecRegisters($vectors, $capData); + my @r8 = getAddrVecRegisters($vectors, $capData); + + # smart enough to count vector registers for memory instructions. + my $regInc = $r eq 'r0' ? scalar(@r0) || 1 : 1; + my $regInc = $r eq 'r8' ? scalar(@r8) || 1 : 1; + + if ($val + $regInc > $regCnt) + { + $regCnt = $val + $regInc; + #print "$val $regCnt $regInc\n"; + } + } + # update the barrier resource count + if ($op eq 'BAR') + { + if (exists $capData->{i8w4}) + { + $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt; + } + # if a barrier value is a register, assume the maximum + elsif (exists $capData->{r8}) + { + $barCnt = 16; + } + } + # Generate the op code. + my ($code, $reuse) = genCode($op, $gram, $capData); + $instructs[$i]{code} = $code; + + # cache this for final pass when we want to calculate reuse stats. + if ($gram->{type}{reuse}) + { $instructs[$i]{caps} = $capData; } + # use the parsed value of reuse for non-reuse type instructions + else + { $ctrl->{reuse}[($i & 3) - 1] = $reuse; } + + + $match = 1; + last; + } + unless ($match) + { + print "$_->{rule}\n\n" foreach @{$grammar{$op}}; + die "Unable to encode instruction: $inst\n"; + } + } + + # final pass to piece together control codes + my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed); + foreach my $i (0 .. 
$#instructs) + { + # op code + if ($i & 3) + { + push @codes, $instructs[$i]{code}; + + if ($instructs[$i]{caps}) + { + # calculate stats on registers + registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn); + } + if ($instructs[$i]{inst} =~ m'EXIT') + { + push @exitOffsets, (scalar(@codes)-1)*8; + } + elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)') + { + push @ctaidOffsets, (scalar(@codes)-1)*8; + $ctaidzUsed = 1 if $1 eq 'Z'; + } + } + # control code + else + { + my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)}; + push @codes, + ($ctrl->[0] << 0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes + ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59); # reuse codes + } + } + + # return the kernel data + return { + RegCnt => $regCnt, + BarCnt => $barCnt, + ExitOffsets => \@exitOffsets, + CTAIDOffsets => \@ctaidOffsets, + CTAIDZUsed => $ctaidzUsed, + ConflictCnt => $reuseHistory{conflicts}, + ReuseCnt => $reuseHistory{reuse}, + ReuseTot => $reuseHistory{total}, + ReusePct => ($reuseHistory{total} ? 
100 * $reuseHistory{reuse} / $reuseHistory{total} : 0), + KernelData => \@codes, + }; +} + +# Useful for testing op code coverage of existing code, extracting new codes and flags +sub Test +{ + my ($fh, $printConflicts, $all) = @_; + + my @instructs; + my %reuseHistory; + my ($pass, $fail) = (0,0); + + while (my $line = <$fh>) + { + my (@ctrl, @reuse); + + next unless processSassCtrlLine($line, \@ctrl, \@reuse); + + foreach my $fileReuse (@reuse) + { + $line = <$fh>; + + my $inst = processSassLine($line) or next; + + $inst->{reuse} = $fileReuse; + my $fileCode = $inst->{code}; + + if (exists $relOffset{$inst->{op}}) + { + # these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump + $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e; + } + + my $match = 0; + foreach my $gram (@{$grammar{$inst->{op}}}) + { + my $capData = parseInstruct($inst->{inst}, $gram) or next; + my @caps; + + # Run in test mode to list what capture groups were captured + my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps); + + # Detect register bank conflicts but only for reuse type instructions. + # If a bank conflict is avoided by a reuse flag then ignore it. + registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? 
$inst->{inst} : '') if $gram->{type}{reuse}; + + $inst->{caps} = join ', ', sort @caps; + $inst->{codeDiff} = $fileCode ^ $code; + $inst->{reuseDiff} = $fileReuse ^ $reuse; + + # compare calculated and file values + if ($code == $fileCode && $reuse == $fileReuse) + { + $inst->{grade} = 'PASS'; + push @instructs, $inst if $all; + $pass++; + } + else + { + $inst->{grade} = 'FAIL'; + push @instructs, $inst; + $fail++; + } + $match = 1; + last; + } + unless ($match) + { + $inst->{grade} = 'FAIL'; + $inst->{codeDiff} = $fileCode; + $inst->{reuseDiff} = $fileReuse; + push @instructs, $inst; + $fail++; + } + } + } + my %maxLen; + foreach (@instructs) + { + $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}}; + } + my ($lastOp, $template); + foreach my $inst (sort { + $a->{op} cmp $b->{op} || + $a->{codeDiff} <=> $b->{codeDiff} || + $a->{reuseDiff} <=> $b->{reuseDiff} || + $a->{ins} cmp $b->{ins} + } @instructs) + { + if ($lastOp ne $inst->{op}) + { + $lastOp = $inst->{op}; + $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s %s\n"; + printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures); + } + printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)}; + } + my $reusePct = $reuseHistory{total} ? 
100 * $reuseHistory{reuse} / $reuseHistory{total} : 0; + + printf "\nRegister Bank Conflicts: %d, Reuse: %.1f% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n", + $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total}; + + return $fail; +} + +# Convert cuobjdump sass to the working format +sub Extract +{ + my ($in, $out, $params) = @_; + + my %paramMap; + my %constants = + ( + blockDimX => 'c[0x0][0x8]', + blockDimY => 'c[0x0][0xc]', + blockDimZ => 'c[0x0][0x10]', + gridDimX => 'c[0x0][0x14]', + gridDimY => 'c[0x0][0x18]', + gridDimZ => 'c[0x0][0x1c]', + ); + print $out "\n"; + + foreach my $const (sort keys %constants) + { + print $out " $const : $constants{$const}\n"; + $paramMap{$constants{$const}} = $const; + } + print $out "\n"; + + foreach my $p (@$params) + { + my ($ord,$offset,$size,$align) = split ':', $p; + + if ($size > 4) + { + my $num = 0; + $offset = hex $offset; + while ($size > 0) + { + my $param = sprintf 'param_%d[%d]', $ord, $num; + my $const = sprintf 'c[0x0][0x%x]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + $size -= 4; + $offset += 4; + $num += 1; + } + } + else + { + my $param = sprintf 'param_%d', $ord; + my $const = sprintf 'c[0x0][%s]', $offset; + $paramMap{$const} = $param; + print $out " $param : $const\n"; + } + } + print $out "\n\n"; + + my %labels; + my $labelnum = 1; + + my @data; + FILE: while (my $line = <$in>) + { + my (@ctrl, @ruse); + next unless processSassCtrlLine($line, \@ctrl, \@ruse); + + CTRL: foreach my $ctrl (@ctrl) + { + $line = <$in>; + + my $inst = processSassLine($line) or next CTRL; + + # Convert branch/jump/call addresses to labels + if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)') + { + my $target = hex($1); + + # skip the final BRA and stop processing the file + last FILE if $inst->{op} eq 'BRA' && ($target == $inst->{num} || $target == $inst->{num}-8); + + # check to see if we've already generated a label for this target 
address + my $label = $labels{$target}; + unless ($label) + { + # generate a label name and cache it + $label = $labels{$target} = "TARGET$labelnum"; + $labelnum++; + } + # replace address with name + $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/; + } + $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg; + + $inst->{ctrl} = printCtrl($ctrl); + + push @data, $inst; + } + } + # make a second pass now that we have the complete instruction address to label mapping + foreach my $inst (@data) + { + print $out "$labels{$inst->{num}}:\n" if exists $labels{$inst->{num}}; + printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)}; + } +} + +my $CommentRe = qr'^[\t ]*.*?^\s*\n?'ms; +my $IncludeRe = qr'^[\t ]*\n?'ms; +my $CodeRe = qr'^[\t ]*(.*?)^\s*<\/CODE\1>\n?'ms; +my $ConstMapRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $RegMapRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $ScheduleRe = qr'^[\t ]*(.*?)^\s*\n?'ms; +my $InlineRe = qr'\[(\+|\-)(.+?)\1\]'ms; + +sub IncludeFile +{ + my ($file, $include) = @_; + my ($vol,$dir,$name) = File::Spec->splitpath($file); + local $/; + my $fh; + if (!open $fh, $file) + { + open $fh, File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n"; + } + my $content = <$fh>; + close $fh; + return $content; +} + +sub Preprocess +{ + my ($file, $include, $debug, $regMap) = @_; + + my $constMap = {}; + my $removeRegMap; + if ($regMap) + { $removeRegMap = 1; } + else + { $regMap = {}; } + + # include nested files + 1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg; + + # Strip out comments + $file =~ s|$CommentRe||g; + + # Execute the CODE sections (old way to run code, to be deprecated) + 1 while $file =~ s|$CodeRe| + my $out = eval "package PascalAs::PascalAs::CODE; $2"; + $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg; + + # Execute the inline code (new way) + $file =~ s|$InlineRe| + my ($type, $code) = ($1, $2); + my $out = eval "package PascalAs::PascalAs::CODE; $code"; + $@ ? 
die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg; + + #Pull in the constMap + $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg; + + my @newFile; + foreach my $line (split "\n", $file) + { + # skip comments + if ($line !~ m'^\s*(?:#|//).*') + { + $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg; + } + push @newFile, $line; + } + $file = join "\n", @newFile; + + # Pull in the reg map first as the Scheduler will need it to handle vector instructions + # Remove the regmap if we're going on to assemble + $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg; + + # Pick out the SCHEDULE_BLOCK sections + my @schedBlocks = $file =~ /$ScheduleRe/g; + + # Schedule them + foreach my $i (0 .. $#schedBlocks) + { + # XMAD macros should only appear in SCHEDULE_BLOCKs + $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]); + + $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug); + } + + # Replace the results + $file =~ s|$ScheduleRe| shift @schedBlocks |eg; + + return $file; +} + +# break the registers down into source and destination categories for the scheduler +my %srcReg = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X); +my %destReg = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC); +my %regops = (%srcReg, %destReg); +my @itypes = qw(class lat rlat tput dual); + +sub Scheduler +{ + my ($block, $blockNum, $regMap, $debug) = @_; + + my $vectors = $regMap->{__vectors}; + my $lineNum = 0; + + my (@instructs, @comments, $ordered, $first); + foreach my $line (split "\n", $block) + { + # keep track of line nums in the physical file + $lineNum++; + + unless (preProcessLine($line)) + { + push @comments, $line if $line =~ m'\S'; + next; + } + + # match an instruction + if (my $inst = processAsmLine($line, $lineNum)) + { + # if the first instruction in the block is waiting on a dep, it should go first. + $inst->{first} = !$first++ && ($inst->{ctrl} & 0x1f800) ? 
0 : 1; + + # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block) + #$inst->{first} = $inst->{ctrl} & 0x0000f ? 1 : 2; + $inst->{exeTime} = 0; + $inst->{order} = $ordered++ if $ordered; + push @instructs, $inst; + } + # match a label + elsif ($line =~ m'^([a-zA-Z]\w*):') + { + die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n"; + } + # open an ORDERED block + elsif ($line =~ m'^') + { + die "you cannot use nested tags" if $ordered; + $ordered = 1; + } + # close an ORDERED block + elsif ($line =~ m'^') + { + die "missing opening for closing tag" if !$ordered; + $ordered = 0; + } + else + { + die "badly formed line at block: $blockNum line: $lineNum: $line\n"; + } + } + + my (%writes, %reads, @ready, @schedule, $orderedParent); + # assemble the instructions to op codes + foreach my $instruct (@instructs) + { + my $match = 0; + foreach my $gram (@{$grammar{$instruct->{op}}}) + { + my $capData = parseInstruct($instruct->{inst}, $gram) or next; + my (@dest, @src); + + # copy over instruction types for easier access + @{$instruct}{@itypes} = @{$gram->{type}}{@itypes}; + + # A predicate prefix is treated as a source reg + push @src, $instruct->{predReg} if $instruct->{pred}; + + # Handle P2R and R2P specially + if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7}) + { + my $list = $instruct->{op} eq 'R2P' ? 
\@dest : \@src; + my $mask = hex($capData->{i20w7}); + foreach my $p (0..6) + { + if ($mask & (1 << $p)) + { + push @$list, "P$p"; + } + # make this instruction dependent on any predicates it's not setting + # this is to prevent a race condition for any predicate sets that are pending + elsif ($instruct->{op} eq 'R2P') + { + push @src, "P$p"; + } + } + # These instructions can't be dual issued + $instruct->{nodual} = 1; + } + + # Populate our register source and destination lists, skipping any zero or true values + foreach my $operand (grep { exists $regops{$_} } sort keys %$capData) + { + # figure out which list to populate + my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src; + + # Filter out RZ and PT + my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT'; + + if ($capData->{$operand} ne $badVal) + { + # add the value to list with the correct prefix + push @$list, + $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) : + $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) : + $operand eq 'CC' ? 'CC' : + $operand eq 'X' ? 'CC' : + getRegNum($regMap, $capData->{$operand}); + } + } + $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39}); + + # Find Read-After-Write dependencies + foreach my $src (grep { exists $writes{$_} } @src) + { + # Memory operations get delayed access to registers but not to the predicate + my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat}; + + # the parent should be the most recently added dest op to the stack + foreach my $parent (@{$writes{$src}}) + { + # add this instruction as a child of the parent + # set the edge to the total latency of reg source availability + #print "R $parent->{inst}\n\t\t$instruct->{inst}\n"; + my $latency = $src =~ m'^P\d' ? 
13 : $parent->{lat}; + push @{$parent->{children}}, [$instruct, $latency - $regLatency]; + $instruct->{parents}++; + + # if the destination was conditionally executed, we also need to keep going back till it wasn't + last unless $parent->{pred}; + } + } + + # Find Write-After-Read dependencies + foreach my $dest (grep { exists $reads{$_} } @dest) + { + # Flag this instruction as dependent to any previous read + foreach my $reader (@{$reads{$dest}}) + { + # no need to stall for these types of dependencies + #print "W $reader->{inst} \t\t\t $instruct->{inst}\n"; + push @{$reader->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + # Once dependence is marked we can clear out the read list (unless this write was conditional). + # The assumption here is that you would never want to write out a register without + # subsequently reading it in some way prior to writing it again. + delete $reads{$dest} unless $instruct->{pred}; + } + + # Enforce instruction ordering where requested + if ($instruct->{order}) + { + if ($orderedParent) + { + push @{$orderedParent->{children}}, [$instruct, 0]; + $instruct->{parents}++; + } + $orderedParent = $instruct; + } + elsif ($orderedParent) + { $orderedParent = 0; } + + # For a dest reg, push it onto the write stack + unshift @{$writes{$_}}, $instruct foreach @dest; + + # For a src reg, push it into the read list + push @{$reads{$_}}, $instruct foreach @src; + + # if this instruction has no dependencies it's ready to go + push @ready, $instruct if !exists $instruct->{parents}; + + $match = 1; + last; + } + die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match; + } + %writes = (); + %reads = (); + + if (@ready) + { + # update dependent counts for sorting hueristic + my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" }; + + countUniqueDescendants($readyParent, {}); + updateDepCounts($readyParent, {}); + + # sort the initial ready list + @ready = sort 
{ + $a->{first} <=> $b->{first} || + $b->{deps} <=> $a->{deps} || + $a->{lineNum} <=> $b->{lineNum} + } @ready; + + if ($debug) + { + print "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n"; + printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready; + } + } + + # Process the ready list, adding new instructions to the list as we go. + my $clock = 0; + while (my $instruct = shift @ready) + { + my $stall = $instruct->{stall}; + + # apply the stall to the previous instruction + if (@schedule && $stall < 16) + { + my $prev = $schedule[$#schedule]; + + # if stall is greater than 4 then also yield + # the yield flag is required to get stall counts 12-15 working correctly. + $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0; + $prev->{ctrl} |= $stall; + $clock += $stall; + } + # For stalls bigger than 15 we assume the user is managing it with a barrier + else + { + $instruct->{ctrl} &= 0x1fff0; + $instruct->{ctrl} |= 1; + $clock += 1; + } + print "$clock: $instruct->{inst}\n" if $debug; + + # add a new instruction to the schedule + push @schedule, $instruct; + + # update each child with a new earliest execution time + if (my $children = $instruct->{children}) + { + foreach (@$children) + { + my ($child, $latency) = @$_; + + # update the earliest clock value this child can safely execute + my $earliest = $clock + $latency; + $child->{exeTime} = $earliest if $child->{exeTime} < $earliest; + + print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug; + + # decrement parent count and add to ready queue if none remaining. + push @ready, $child if --$child->{parents} < 1; + } + delete $instruct->{children}; + } + + # update stall and mix values in the ready queue on each iteration + foreach my $ready (@ready) + { + # calculate how many instructions this would cause the just added instruction to stall. 
# Parse a constant map into $constMap.
#
# Every non-blank line of $constMapText has the form "name : value".
# Leading and trailing whitespace is removed, as is anything following a
# '#' or '//' comment marker.
#
#   $constMap     - hash ref that receives the name => value pairs
#   $constMapText - newline separated mapping text
#
# Returns nothing.
sub setConstMap
{
    my ($constMap, $constMapText) = @_;

    foreach my $line (split "\n", $constMapText)
    {
        # strip leading space
        $line =~ s|^\s+||;
        # strip comments
        $line =~ s{(?:#|//).*}{};
        # strip trailing space
        $line =~ s|\s+$||;
        # skip blank lines
        next unless $line =~ m'\S';

        my ($name, $value) = split '\s*:\s*', $line;

        $constMap->{$name} = $value;
    }
    return;
}

# Parse a register map into $regMap.
#
# Every non-blank line of $regmapText is "<numbers> <sep> <names>", where
# <numbers> is a comma separated list of register numbers or "a-b" ranges and
# <names> is a comma separated list of register names.  The separator selects
# how names are bound:
#
#   ':'  fixed  - the n-th name maps to 'R' . the n-th number
#   '~'  auto   - every name maps to a reference to the whole number list,
#                 to be resolved later; an optional "[0-3]" suffix on a name
#                 is recorded in $regMap->{__regbank}
#   '='  share  - every name maps to the single listed register
#
# A name may be written as "base<0-3|8>suffix", which expands to one name per
# listed index (base0suffix, base1suffix, ...).
#
# For fixed mappings over a gap-free ascending number list, every name bound
# to an even register is additionally recorded in $regMap->{__vectors} with
# the names that follow it (2 or 4 wide, see below), and the index-less alias
# "basesuffix" is bound to the first such register of an expanded name.
#
#   $regMap     - hash ref that receives the mappings
#   $regmapText - newline separated mapping text
#
# Dies on malformed numbers or names, on a name/number count mismatch and on a
# name that is already present in $regMap.  Returns nothing.
sub setRegisterMap
{
    my ($regMap, $regmapText) = @_;

    my $vectors = $regMap->{__vectors} ||= {};
    my $regBank = $regMap->{__regbank} ||= {};
    my %aliases;

    foreach my $line (split "\n", $regmapText)
    {
        # strip leading space
        $line =~ s|^\s+||;
        # strip comments
        $line =~ s{(?:#|//).*}{};
        # strip trailing space
        $line =~ s|\s+$||;
        # skip blank lines
        next unless $line =~ m'\S';

        my $auto  = $line =~ /~/;
        my $share = $line =~ /=/;

        my ($regNums, $regNames) = split '\s*[:~=]\s*', $line;

        my (@numList, @nameList);
        foreach my $num (split '\s*,\s*', $regNums)
        {
            my ($start, $stop) = split '\s*\-\s*', $num;
            die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop;
            # a lone number has no $stop; treat it as a one element range
            push @numList, ($start .. $stop||$start);
        }
        foreach my $fullName (split '\s*,\s*', $regNames)
        {
            if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$')
            {
                # copy the captures before any further pattern operation runs
                my ($name1, $ranges, $name2, $bank) = ($1, $2, $3, $4);
                foreach (split '\s*\|\s*', $ranges)
                {
                    my ($start, $stop) = split '\s*\-\s*';
                    foreach my $r (map "$name1$_$name2", $start .. $stop||$start)
                    {
                        # define an alias for use in vector instructions that omits the number portion
                        $aliases{$r} = "$name1$name2" unless exists $aliases{$r};
                        push @nameList, $r;
                        $regBank->{$r} = $bank if $auto && defined $bank;
                        warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank;
                    }
                }
            }
            elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$')
            {
                push @nameList, $1;
                $regBank->{$1} = $2 if $auto && defined $2;
                warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2;
            }
            else
            {
                die "Bad register name: '$fullName' at: $line\n";
            }
        }
        die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList;
        die "Missmatched register mapping at: $line\n" if  $share && @numList > 1;

        # detect if this list is monotonically ascending with no gaps.
        # Every adjacent pair has to be checked, including the last one;
        # a list with fewer than two numbers can never form a vector.
        my $ascending = @numList > 1;
        foreach my $i (1 .. $#numList)
        {
            next if $numList[$i-1] + 1 == $numList[$i];
            $ascending = 0;
            last;
        }

        foreach my $n (0..$#nameList)
        {
            die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]};

            if ($auto)
            {
                # assign possible values to be assigned on assembly
                $regMap->{$nameList[$n]} = \@numList;
            }
            elsif ($share)
            {
                # each name shares the same single register
                $regMap->{$nameList[$n]} = 'R' . $numList[0];
            }
            else
            {
                $regMap->{$nameList[$n]} = 'R' . $numList[$n];
                # flag any even register as a potential vector
                if ($ascending && ($numList[$n] & 1) == 0)
                {
                    # constrain potential range to vector alignment:
                    # a register with bit 1 set, or one with fewer than three
                    # names after it, can only start a 2-wide vector; any
                    # other even register starts a 4-wide one.
                    my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3);
                    if ($end <= $#nameList)
                    {
                        $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ];
                        #setup an alias for the base name without the number
                        if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}})
                        {
                            $regMap->{$aliases{$nameList[$n]}}  = $regMap->{$nameList[$n]};
                            $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]};
                            delete $aliases{$nameList[$n]};
                        }
                    }
                }
            }
        }
    }
    #print Dumper($regMap); exit(1);
    return;
}

# Strip leading whitespace from the caller's line IN PLACE (through the @_
# alias) and report whether anything other than a '#' or '//' comment is left.
# The comment itself is preserved in the caller's variable.
#
# Returns a true value when the line carries content, false otherwise.
sub preProcessLine
{
    # strip leading space
    $_[0] =~ s|^\s+||;

    # preserve comment but check for emptiness
    my $val = shift;

    # strip comments
    $val =~ s{(?:#|//).*}{};

    # skip blank lines
    return $val =~ m'\S';
}

# traverse the graph and count total descendants per node.
# only count unique nodes (by lineNum)
#
#   $node  - hash ref with {lineNum} and optionally {children}, a list of
#            [ childNode, latency ] pairs
#   $edges - hash ref used as the set of already traversed edges; pass the
#            same (initially empty) hash to the whole traversal
#
# Children whose latency is false are not followed.  Each node's {deps} hash
# collects the lineNum of every descendant reached through it.
#
# Returns the node's own lineNum followed by the keys of its {deps} hash.
sub countUniqueDescendants
{
    my ($node, $edges) = @_;

    #warn "$node->{inst}\n";

    if (my $children = $node->{children})
    {
        foreach my $child (grep $_->[1], @$children) # skip WaR deps and traversed edges
        {
            next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++;

            $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges);
        }
    }
    else
    {
        return $node->{lineNum};
    }
    return ($node->{lineNum}, keys %{$node->{deps}});
}
# convert hash to count for easier sorting.
#
# Walks the graph below $node and replaces every {deps} entry with a plain
# number: the key count when {deps} is a hash ref, otherwise its numeric
# value (0 when unset).
#
#   $node  - hash ref with {lineNum}, optional {deps} and optional {children},
#            a list of [ childNode, latency ] pairs
#   $edges - hash ref used as the set of already traversed edges; pass the
#            same (initially empty) hash to the whole traversal
sub updateDepCounts
{
    my ($node, $edges) = @_;

    #warn "$node->{inst}\n";

    if (my $children = $node->{children})
    {
        foreach my $child (@$children)
        {
            next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++;
            updateDepCounts($child->[0], $edges);
        }
    }
    # keys() is evaluated in scalar context here and so yields the key count
    $node->{deps} = ref $node->{deps} ? keys %{$node->{deps}} : $node->{deps}+0;
}

# Detect register bank conflicts and calculate reuse stats
#
#   $reuseHistory - hash ref carried from one instruction to the next; holds a
#                   per-slot hash of registers currently flagged for reuse
#                   plus the running counters {total}, {reuse}, {conflicts}
#   $reuseFlags   - bit mask tested against $reuseSlots{$slot} for each slot
#                   (%reuseSlots is not defined in this block; it is expected
#                   to be in scope in the surrounding file)
#   $capData      - hash ref holding the source operand names under the keys
#                   r8, r20 and r39 (missing slots and 'RZ' are ignored)
#   $instAddr     - instruction address, only used in the conflict message
#   $inst         - instruction text; no message is printed when it is false
#   $nowarn       - true to suppress the conflict message
#
# The bank of a register is its number & 3.  Two different registers that are
# not in active reuse and fall into the same bank are a conflict.
#
# Returns the number of registers taking part in conflicts (0 when none).
sub registerHealth
{
    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;

    my (@banks, @conflicts);

    foreach my $slot (qw(r8 r20 r39))
    {
        my $r = $capData->{$slot} or next;
        next if $r eq 'RZ';

        my $slotHist = $reuseHistory->{$slot} ||= {};

        $reuseHistory->{total}++;

        # if this register is in active reuse then ignore for bank conflict checking.
        if (exists $slotHist->{$r})
        {
            $reuseHistory->{reuse}++;
        }
        else
        {
            # extract number from reg and take the modulo-4 value.  This is the bank id.
            my $bank = substr($r,1) & 3;

            # check for conflict
            if ($banks[$bank] && $banks[$bank] ne $r)
            {
                # the first conflict also reports the register already in the bank
                push @conflicts, $banks[$bank] if !@conflicts;
                push @conflicts, $r;

                $reuseHistory->{conflicts}++;
            }
            $banks[$bank] = $r;
        }

        # update the history
        if ($reuseFlags & $reuseSlots{$slot})
            { $slotHist->{$r} = 1; }
        else
            { delete $slotHist->{$r}; }
    }
    if ($inst && @conflicts && !$nowarn)
    {
        # pass the instruction text as data: it must never be interpreted as
        # part of the format string.
        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
    }
    return scalar @conflicts;
}
free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +=cut diff --git a/Assembler/PascalAs/lib/PascalAs/PascalAsGrammar.pm b/Assembler/PascalAs/lib/PascalAs/PascalAsGrammar.pm new file mode 100644 index 0000000..bf25fb8 --- /dev/null +++ b/Assembler/PascalAs/lib/PascalAs/PascalAsGrammar.pm @@ -0,0 +1,1437 @@ +package PascalAs::PascalAsGrammar; + +use strict; +use Carp; +use Exporter; +use Data::Dumper; +our @ISA = qw(Exporter); + +our @EXPORT = qw( + %grammar %flags + parseInstruct genCode genReuseCode + processAsmLine processSassLine processSassCtrlLine + replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters +); + +require 5.10.0; + +# Helper functions for operands +sub getI +{ + my ($orig, $pos, $mask) = @_; + my $val = $orig; + my $neg = $val =~ s|^\-||; + + # parse out our custom index immediates for addresses + if ($val =~ m'^(\d+)[xX]<([^>]+)>') + { + # allow any perl expression and multiply result by leading decimal. + # also allow global scalar varibles in the expression. 
# Encode a register operand.
#
#   $val - register name: 'R0' .. 'R254' or 'RZ'
#   $pos - bit position of the operand field
#
# 'RZ' encodes as 0xff, which is why 'R255' is not accepted as a numbered
# register.  Returns the register number shifted left by $pos; dies with
# "Bad register name found" for anything else.
sub getR
{
    my ($val, $pos) = @_;
    # test the symbolic name explicitly rather than comparing 'Z' numerically
    if ($val =~ m'^R(\d+|Z)$' && ($1 eq 'Z' || $1 < 255))
    {
        $val = $1 eq 'Z' ? 0xff : $1;
    }
    else
    {
        die "Bad register name found: $val\n";
    }
    return $val << $pos;
}

# Encode a predicate operand.
#
#   $val - predicate name: 'P0' .. 'P6' or 'PT'
#   $pos - bit position of the operand field
#
# 'PT' encodes as 7, which is why 'P7' is not accepted as a numbered
# predicate.  Returns the predicate number shifted left by $pos; dies with
# "Bad predicate name found" for anything else.
sub getP
{
    my ($val, $pos) = @_;
    # test the symbolic name explicitly rather than comparing 'T' numerically
    if ($val =~ m'^P(\d|T)$' && ($1 eq 'T' || $1 < 7))
    {
        $val = $1 eq 'T' ? 7 : $1;
    }
    else
    {
        die "Bad predicate name found: $val\n";
    }
    return $val << $pos;
}

# Encode a constant offset given as a hex string: the value is divided by 4
# (>> 2), limited to 15 bits and placed at bit 20.
sub getC { ((hex($_[0]) >> 2) & 0x7fff) << 20 }
i36w20 => sub { getI($_[0], 36, 0xfffff) }, + i39w8 => sub { getI($_[0], 39, 0xff) }, + i28w8 => sub { getI($_[0], 28, 0xff) }, + i28w20 => sub { getI($_[0], 28, 0xfffff) }, + i48w8 => sub { getI($_[0], 48, 0xff) }, + i51w5 => sub { getI($_[0], 51, 0x1f) }, + i53w5 => sub { getI($_[0], 53, 0x1f) }, +); + +# Rules for operands and their closely tied flags +my $hex = qr"0[xX][0-9a-fA-F]+"; +my $iAddr = qr"\d+[xX]<[^>]+>"; +my $immed = qr"$hex|$iAddr|\d+"o; +my $reg = qr"[a-zA-Z_]\w*"; # must start with letter or underscore\ +my $p = qr"P[0-6T]"; +my $noPred = qr"(?)"; +my $pred = qr"\@(?\!)?P(?[0-6]) "; +my $p0 = qr"(?$p)"o; +my $p3 = qr"(?$p)"o; +my $p12 = qr"(?\!)?(?$p)"o; +my $p29 = qr"(?\!)?(?$p)"o; +my $p39 = qr"(?\!)?(?$p)"o; +my $p45 = qr"(?$p)"o; +my $p48 = qr"(?$p)"o; +my $p58 = qr"(?$p)"o; +my $r0 = qr"(?$reg)"; +my $r0cc = qr"(?$reg)(?\.CC)?"; +my $r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?\.reuse)?"; +my $r28 = qr"(?$reg)"; +my $r39s20 = qr"(?\-)?(?\|)?(?(?$reg))\|?(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39 = qr"(?\-)?(?$reg)(?:\.(?H0|H1))?(?\.reuse)?"; +my $r39a = qr"(?(?$reg))(?\.reuse)?"; +my $c20 = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20x = qr"(?\-)?(?\|)?c\[(?$hex)\]\s*\[(?$hex)\]\|?(?:\.(?H0|H1|B0|B1|B2|B3))?"o; +my $c20s39 = qr"(?\-)?c\[(?$hex)\]\s*\[(?$hex)\]"o; +my $f20w32 = qr"(?(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))"; +my $f20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $d20 = qr"(?(?:(?\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?\.NEG)?"o; +my $i8w4 = qr"(?$immed)"o; +my $i20 = qr"(?(?\-)?$immed)(?\.NEG)?"o; +my $i20w6 = qr"(?$immed)"o; +my $i20w7 = qr"(?$immed)"o; +my $i20w8 = qr"(?$immed)"o; +my $i20w12 = qr"(?$immed)"o; +my $i20w24 = qr"(?\-?$immed)"o; +my $i20w32 = qr"(?\-?$immed)"o; +my $i39w8 = qr"(?\-?$immed)"o; +my $i28w8 = 
qr"(?$immed)"o; +my $i28w20 = qr"(?\-?$immed)"o; +my $i31w4 = qr"(?$immed)"o; +my $i34w13 = qr"(?$immed)"o; +my $i36w20 = qr"(?$immed)"o; +my $i48w8 = qr"(?$immed)"o; +my $i51w5 = qr"(?$immed)"o; +my $i53w5 = qr"(?$immed)"o; +my $ir20 = qr"$i20|$r20"o; +my $cr20 = qr"$c20|$r20"o; +my $icr20 = qr"$i20|$c20|$r20"o; +my $fcr20 = qr"$f20|$c20|$r20"o; +my $cr39 = qr"$c20s39|$r39"o; +my $dr20 = qr"$d20|$r20"o; + +# Instruction specific rules for capturing various flags +my $u32 = qr"(?\.U32)?"; +my $ftz = qr"(?\.FTZ)?"; +my $sat = qr"(?\.SAT)?"; +my $rnd = qr"(?:\.(?RN|RM|RP|RZ))?"; +my $round = qr"(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?"; +my $fcmp = qr"(?\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; +my $icmp = qr"\.(?LT|EQ|LE|GT|NE|GE)"; +my $bool = qr"\.(?AND|OR|XOR|PASS_B)"; +my $bool2 = qr"\.(?AND|OR|XOR)"; +my $func = qr"\.(?COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)"; +my $rro = qr"\.(?SINCOS|EX2)"; +my $add3 = qr"(?:\.(?X|RS|LS))?"; +my $lopz = qr"(?:\.(?NZ|Z) $p48,|(?))"o; +my $X = qr"(?\.X)?"; +my $tld = qr"(?NODEP\.)?(?:(?T)|(?P))"; +my $chnls = qr"(?R|RGBA)"; +my $sr = qr"SR_(?\S+)"; +my $shf = qr"(?\.W)?(?:\.(?U64|S64))?(?\.HI)?"; +my $xmad = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $xmadc = qr"(?:\.(?U16|S16))?(?:\.(?U16|S16))?(?:\.(?MRG|PSL|CHI|CLO|CSFU))?(?\.CBCC)?"; +my $vmad8 = qr"\.(?[SU])(?8|16)\.(?[SU])(?8|16)(?\.PO)?(?\.SHR_7)?(?\.SHR_15)?(?\.SAT)?"; +my $vmad16= qr"\.(?[SU])(?16)\.(?[SU])(?16)"; +my $hilo = qr"(?:\.(?XHI|XLO))?"; +my $vaddType = qr"(?:\.(?UD))?(?:\.(?SD))?(?:\.(?[SU])(?8|16|32))?(?:\.(?[SU])(?8|16|32))?"; +my $vaddMode = qr"(?:\.(?MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?"; +my $vmnmx = qr"(?:\.(?MX))?"; +my $x2x = qr"\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)"; +my $prmt = qr"(?:\.(?F4E|B4E|RC8|ECL|ECR|RC16))?"; +my $shfl = qr"\.(?IDX|UP|DOWN|BFLY)"; +my $bar = qr"\.(?SYNC|ARV|RED)(?:\.(?POPC|AND|OR))? 
(?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?()|(?))(?(), $p39|(?))"o; +my $b2r = qr"\.RESULT $r0(?:, $p45|(?))"o; +my $dbar = qr"(?SB0|SB1|SB2|SB3|SB4|SB5)"; +my $dbar2 = qr" {(?5)?,?(?4)?,?(?3)?,?(?2)?,?(?1)?,?(?0)?}"; +my $mbar = qr"\.(?CTA|GL|SYS)"; +my $addr = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i20w24)?\]"o; +my $addr2 = qr"\[(?:(?$reg)|(?))(?:\s*\+?\s*$i28w20)?\]"o; +my $ldc = qr"c\[(?$hex)\]\s*$addr"o; +my $atom = qr"(?\.E)?(?:\.(?ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; +my $vote = qr"\.(?ALL|ANY|EQ)"o; +my $memType = qr"(?\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; +my $memCache = qr"(?\.E)?(?\.U)?(?:\.(?CG|CI|CS|CV|IL|WT))?"; + + + +# class: hardware resource that shares characteristics with types +# lat : pipeline depth where relevent, placeholder for memory ops +# blat : barrier latency, typical fetch time for memory operations. Highly variable. +# rlat : operand read latency for memory ops +# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op. +# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession. +# dual : whether this instruction type can be dual issued +# reuse: whether this instruction type accepts register reuse flags. + +# Some of these values are guesses and need to be updated from micro benchmarks. +# We may need to split these classes up further. 
+my $s2rT = {class => 's2r', lat => 2, blat => 25, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $smemT = {class => 'mem', lat => 2, blat => 30, rlat => 2, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $gmemT = {class => 'mem', lat => 2, blat => 200, rlat => 4, rhold => 20, tput => 1, dual => 1, reuse => 0}; +my $x32T = {class => 'x32', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 1}; +my $x64T = {class => 'x64', lat => 2, blat => 128, rlat => 0, rhold => 0, tput => 128, dual => 0, reuse => 1}; +my $shftT = {class => 'shift', lat => 6, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $cmpT = {class => 'cmp', lat => 13, blat => 0, rlat => 0, rhold => 0, tput => 2, dual => 0, reuse => 1}; +my $qtrT = {class => 'qtr', lat => 8, blat => 0, rlat => 4, rhold => 0, tput => 1, dual => 1, reuse => 0}; +my $rroT = {class => 'rro', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; +my $voteT = {class => 'vote', lat => 2, blat => 0, rlat => 0, rhold => 0, tput => 1, dual => 0, reuse => 0}; + + +# Create map of op names to rules +our %grammar = +( + #Floating Point Instructions + FADD => [ { type => $x32T, code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FADD32I => [ { type => $x32T, code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o, } ], + FCHK => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o, } ], #Partial? 
+ FCMP => [ { type => $cmpT, code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o, } ], + FFMA => [ + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o, }, + { type => $x32T, code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o, }, + ], + FMNMX => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o, } ], + FMUL => [ { type => $x32T, code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o, } ], + FMUL32I => [ { type => $x32T, code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o, } ], + FSET => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o, } ], + FSETP => [ { type => $cmpT, code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], + MUFU => [ { type => $qtrT, code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o, } ], + RRO => [ { type => $rroT, code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o, } ], + DADD => [ { type => $x64T, code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o, } ], + DFMA => [ { type => $x64T, code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o, } ], + DMNMX => [ { type => $cmpT, code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o, } ], + DMUL => [ { type => $x64T, code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o, } ], + DSET => [ { type => $cmpT, code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o, } ], + DSETP => [ { type => $cmpT, code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o, } ], + FSWZADD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o, } ], #TODO + + HADD2 => [ 
{ type => $x32T, code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o, } ], + HMUL2 => [ { type => $x32T, code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o, } ], + HFMA2 => [ { type => $x32T, code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o, } ], + HSETP2 => [ { type => $cmpT, code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial + + #Integer Instructions + BFE => [ { type => $shftT, code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o, } ], + BFI => [ { type => $shftT, code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o, } ], + FLO => [ { type => $s2rT, code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o, } ], + IADD => [ { type => $x32T, code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o, } ], + IADD32I => [ { type => $x32T, code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o, } ], + IADD3 => [ { type => $x32T, code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o, } ], + ICMP => [ { type => $cmpT, code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o, } ], + IMNMX => [ { type => $shftT, code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o, } ], + ISET => [ { type => $shftT, code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o, } ], + ISETP => [ { type => $cmpT, code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ], + ISCADD => [ { type => $shftT, code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o, } ], + ISCADD32I => [ { type => $shftT, code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o, } ], + LEA => [ + { type => $cmpT, code => 0x5bd0000000000000, rule => 
qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o, }, + { type => $shftT, code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o, }, + { type => $shftT, code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o, }, + { type => $shftT, code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o, }, + ], + LOP => [ { type => $x32T, code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?~)?$icr20(?\.INV)?;"o, } ], + LOP32I => [ { type => $x32T, code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o, } ], + LOP3 => [ + { type => $x32T, code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o, }, + { type => $x32T, code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o, }, + ], + POPC => [ { type => $s2rT, code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o, } ], + SHF => [ + { type => $shftT, code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o, }, + { type => $shftT, code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o, }, + ], + SHL => [ { type => $shftT, code => 0x5c48000000000000, rule => qr"^$pred?SHL(?\.W)? 
$r0, $r8, $icr20;"o, } ], + SHR => [ { type => $shftT, code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o, } ], + XMAD => [ + { type => $x32T, code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o, }, + { type => $x32T, code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o, }, + { type => $x32T, code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o, }, + ], + # XMAD replaces these + IMAD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o, } ], #TODO + IMADSP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO + IMUL => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o, } ], #TODO + + #Conversion Instructions + F2F => [ { type => $qtrT, code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ], + F2I => [ { type => $qtrT, code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o, } ], + I2F => [ { type => $qtrT, code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o, } ], + I2I => [ { type => $qtrT, code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o, } ], + + #Movement Instructions + MOV => [ { type => $x32T, code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o, } ], + MOV32I => [ { type => $x32T, code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o, } ], + PRMT => [ { type => $x32T, code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ], + SEL => [ { type => $x32T, code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o, } ], + SHFL => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ], + + #Predicate/CC Instructions + PSET => [ { type => $cmpT, code => 0x5088000000000000, rule => 
qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o, } ], + PSETP => [ { type => $cmpT, code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ], + CSET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o, } ], #TODO + CSETP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO + P2R => [ { type => $x32T, code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o, } ], + R2P => [ { type => $cmpT, code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o, } ], + + #Texture Instructions + # Handle the commonly used 1D texture functions.. but save the others for later + TLD => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial + TLDS => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial + TEX => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o, } ], #TODO + TLD4 => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o, } ], #TODO + TXQ => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o, } ], #TODO + TEXS => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o, } ], #TODO + TLD4S => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO + + #Compute Load/Store Instructions + LD => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o, } ], + ST => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o, } ], + LDG => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o, } ], + STG => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, 
$r0;"o, } ], + LDS => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o, } ], + STS => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o, } ], + LDL => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o, } ], + STL => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o, } ], + LDC => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o, } ], + # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded). + ATOM => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + ATOMS => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o, } ], + RED => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o, } ], + CCTL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o, } ], #TODO + CCTLL => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO + CCTLT => [ { type => $x32T, code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO + + #Surface Memory Instructions (haven't gotten to these yet..) + SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO + SULD => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o, } ], #TODO + SURED => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o, } ], #TODO + SUST => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o, } ], #TODO + + #Control Instructions + BRA => [ + { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?\.U)? 
$i20w24;"o, }, + { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?\.U)? CC\.EQ, $i20w24;"o, }, + ], + BRX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o, } ], #TODO + JMP => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o, } ], #TODO + JMX => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o, } ], #TODO + SSY => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o, } ], + SYNC => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o, } ], + CAL => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o, } ], + JCAL => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o, } ], + PRET => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o, } ], #TODO + RET => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o, } ], + BRK => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o, } ], + PBK => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o, } ], + CONT => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o, } ], + PCNT => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o, } ], + EXIT => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o, } ], + PEXIT => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o, } ], #TODO + BPT => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o, } ], + + #Miscellaneous Instructions + NOP => [ { type => $x32T, code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o, } ], + CS2R => [ { type => $x32T, code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o, } ], + S2R => [ { type => $s2rT, code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o, } ], + B2R => [ { type => $x32T, code => 
0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o, } ], + BAR => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o, } ], + DEPBAR => [ + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, }, + { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o, }, + ], + MEMBAR => [ { type => $x32T, code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o, } ], + VOTE => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?))$p45, $p39;"o, } ], + R2B => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o, } ], #TODO + + #Video Instructions... Need to finish + VADD => [ { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMAD => [ + { type => $x32T, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, }, + { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, }, + ], + VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + VMNMX => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 + + VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 +); + +# Create map of capture groups to op code flags that need to be added (or removed) +my @flags = grep /\S/, split "\n", q{; + +BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD +0x0100000000000000 neg + +FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP +0x0100000000000000 neg + +PSET, PSETP 
+0x0000000000008000 p12not +0x0000000100000000 p29not + +FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE +0x0000040000000000 p39not + +IADD, IADD3, XMAD, LEA, IMNMX +0x0000800000000000 CC + +IADD32I +0x0010000000000000 CC + +LEA +0x0000000000000000 X + +SHF +0x0004000000000000 W +0x0001000000000000 HI + +SHF: type +0x0000004000000000 U64 +0x0000006000000000 S64 + +SHR, IMNMX, ISETP, ISET, ICMP, BFE +0x0001000000000000 U32 + +SHL +0x0000008000000000 W + +SHFL +0x0000000010000000 i20w8 +0x0000000020000000 i34w13 + +SHFL: mode +0x0000000000000000 IDX +0x0000000040000000 UP +0x0000000080000000 DOWN +0x00000000c0000000 BFLY + +IMNMX: mode +0x0000080000000000 XLO +0x0000180000000000 XHI + +ISETP, ISET, ICMP: cmp +0x0002000000000000 LT +0x0004000000000000 EQ +0x0006000000000000 LE +0x0008000000000000 GT +0x000a000000000000 NE +0x000c000000000000 GE + +ISETP, ISET, PSETP, PSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +PSETP, PSET: bool2 +0x0000000000000000 AND +0x0000000001000000 OR +0x0000000002000000 XOR + +ISETP, ISET +0x0000080000000000 X + +LOP: bool +0x0000000000000000 AND +0x0000020000000000 OR +0x0000040000000000 XOR +0x0000060000000000 PASS_B + +LOP: +0x0000010000000000 INV + +LOP: z +0x0000200000000000 Z +0x0000300000000000 NZ + +LOP +0x0007000000000000 noz + +LOP32I: bool +0x0000000000000000 AND +0x0020000000000000 OR +0x0040000000000000 XOR + +PRMT: mode +0x0001000000000000 F4E +0x0002000000000000 B4E +0x0003000000000000 RC8 +0x0004000000000000 ECL +0x0005000000000000 ECR +0x0006000000000000 RC16 + +XMAD: type1 +0x0000000000000000 U16 +0x0001000000000000 S16 + +XMAD: type2 +0x0000000000000000 U16 +0x0002000000000000 S16 + +XMAD: mode +0x0000002000000000 MRG +0x0000001000000000 PSL +0x0008000000000000 CHI +0x0004000000000000 CLO +0x000c000000000000 CSFU + +XMAD: modec +0x0004000000000000 CLO +0x0008000000000000 CHI +0x000c000000000000 CSFU +0x0040000000000000 X +0x0080000000000000 PSL 
+0x0100000000000000 MRG + +XMAD +0x0010000000000000 CBCC + +XMAD: r8part +0x0000000000000000 H0 +0x0020000000000000 H1 + +XMAD: r20part +0x0000000000000000 H0 +0x0000000800000000 H1 + +XMAD: r20partx +0x0000000000000000 H0 +0x0010000000000000 H1 + +XMAD: r39part +0x0000000000000000 H0 +0x0010000000000000 H1 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part +0x0000000000000000 B0 +0x0000001000000000 B1 +0x0000002000000000 B2 +0x0000003000000000 B3 +0x0000001000000000 H1 +0x0000000000000000 H0 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part +0x0000000000000000 B0 +0x0000000010000000 B1 +0x0000000020000000 B2 +0x0000000030000000 B3 +0x0000000010000000 H1 +0x0000000000000000 H0 + +VMAD +0x0040000000000000 r8neg +0x0020000000000000 r39neg +0x0008000000000000 SHR_7 +0x0010000000000000 SHR_15 +0x0060000000000000 PO +0x0080000000000000 SAT + +VMNMX +0x0100000000000000 MX + +VADD, VABSDIFF, VMNMX +0x0080000000000000 SAT +0x0040000000000000 UD +0x0040000000000000 SD + +VSET: cmp +0x0040000000000000 LT +0x0080000000000000 EQ +0x00c0000000000000 LE +0x0100000000000000 GT +0x0140000000000000 NE +0x0180000000000000 GE + +VADD, VSET: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VABSDIFF: mode +0x0003000000000000 ACC +0x000b000000000000 MIN +0x0013000000000000 MAX +0x0023000000000000 MRG_16H +0x002b000000000000 MRG_16L +0x0033000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x003b000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMNMX: mode +0x0020000000000000 ACC +0x0028000000000000 MIN +0x0030000000000000 MAX +0x0000000000000000 MRG_16H +0x0008000000000000 MRG_16L +0x0010000000000000 MRG_8B0 +0x0000000000000000 MRG_8B1 +0x0018000000000000 MRG_8B2 +0x0000000000000000 MRG_8B3 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1 +0x0000000000000000 U +0x0001000000000000 S + +VMAD, VADD, 
VABSDIFF, VMNMX, VSET: sign2 +0x0000000000000000 U +0x0002000000000000 S + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size1 +0x0000000000000000 8 +0x0000004000000000 16 +0x0000006000000000 32 + +VMAD, VADD, VABSDIFF, VMNMX, VSET: size2 +0x0000000000000000 8 +0x0000000040000000 16 +0x0000000060000000 32 + +IADD3: type +0x0001000000000000 X +0x0000002000000000 RS +0x0000004000000000 LS + +IADD3: r8part +0x0000000000000000 H0 +0x0000001000000000 H1 + +IADD3: r20part +0x0000000080000000 H0 + +IADD3: r39part +0x0000000200000000 H0 + +IADD3 +0x0008000000000000 r8neg +0x0004000000000000 r20neg +0x0002000000000000 r39neg + +IADD +0x0000080000000000 X +0x0004000000000000 SAT + +IADD, ISCADD +0x0002000000000000 r8neg +0x0001000000000000 r20neg + +IADD32I +0x0100000000000000 r8neg +0x0020000000000000 X + +DEPBAR: SB +0x0000000000000000 SB0 +0x0000000004000000 SB1 +0x0000000008000000 SB2 +0x000000000c000000 SB3 +0x0000000010000000 SB4 +0x0000000014000000 SB5 + +DEPBAR: cmp +0x0000000020000000 LE + +DEPBAR +0x0000000000000001 db0 +0x0000000000000002 db1 +0x0000000000000004 db2 +0x0000000000000008 db3 +0x0000000000000010 db4 +0x0000000000000020 db5 + +F2F, F2I, I2F, I2I: destWidth +0x0000000000000000 8 +0x0000000000000100 16 +0x0000000000000200 32 +0x0000000000000300 64 + +F2F, F2I, I2F, I2I: srcWidth +0x0000000000000000 8 +0x0000000000000400 16 +0x0000000000000800 32 +0x0000000000000c00 64 + +F2F, F2I, I2F, I2I: destSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000001000 S + +F2F, F2I, I2F, I2I: srcSign +0x0000000000000000 F +0x0000000000000000 U +0x0000000000002000 S + +F2I, I2F, I2I: r20part +0x0000000000000000 H0 +0x0000040000000000 H1 +0x0000000000000000 B0 +0x0000020000000000 B1 +0x0000040000000000 B2 +0x0000060000000000 B3 + +F2F: r20part +0x0000000000000000 H0 +0x0000020000000000 H1 + +F2F: round +0x0000040000000000 ROUND +0x0000048000000000 FLOOR +0x0000050000000000 CEIL +0x0000058000000000 TRUNC + +F2I: round +0x0000000000000000 ROUND +0x0000008000000000 FLOOR 
+0x0000010000000000 CEIL +0x0000018000000000 TRUNC + +HADD2, HMUL2: r8part +0x0001000000000000 H0_H0 +0x0000000000000000 H1_H1 + +HFMA2: r20part +0x0000000020000000 H0_H0 +0x0000000030000000 H1_H1 + +FADD, DADD, FMUL, DMUL, F2F, I2F: rnd +0x0000000000000000 RN +0x0000008000000000 RM +0x0000010000000000 RP +0x0000018000000000 RZ + +DFMA: rnd +0x0000000000000000 RN +0x0004000000000000 RM +0x0008000000000000 RP +0x000c000000000000 RZ + +FFMA: rnd +0x0000000000000000 RN +0x0008000000000000 RM +0x0010000000000000 RP +0x0018000000000000 RZ + +FFMA +0x0020000000000000 FTZ + +F2F, F2I, FADD, FMUL, FMNMX +0x0000100000000000 FTZ + +FADD32I +0x0080000000000000 FTZ + +FMUL32I +0x0020000000000000 FTZ + +FSET +0x0080000000000000 FTZ + +FSETP, FCMP +0x0000800000000000 FTZ + +HADD2, HMUL2 +0x0000008000000000 FTZ + +HFMA2 +0x0000002000000000 FTZ + +FADD, FFMA, FMUL, F2F, I2I +0x0004000000000000 SAT + +FADD, DADD, FMNMX, DMNMX, MUFU +0x0001000000000000 r8neg + +FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I +0x0000200000000000 r20neg + +FMUL, DMUL, FFMA, DFMA +0x0001000000000000 r20neg + +FFMA, DFMA +0x0002000000000000 r39neg + +FADD, DADD, FMNMX, DMNMX +0x0000400000000000 r8abs + +FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I +0x0002000000000000 r20abs + +FSETP, DSETP, FSET, DSET +0x0000080000000000 r8neg +0x0000000000000040 r20neg +0x0000000000000080 r8abs +0x0000100000000000 r20abs + +RRO: func +0x0000000000000000 SINCOS +0x0000008000000000 EX2 + +MUFU: func +0x0000000000000000 COS +0x0000000000100000 SIN +0x0000000000200000 EX2 +0x0000000000300000 LG2 +0x0000000000400000 RCP +0x0000000000500000 RSQ +0x0000000000600000 RCP64H +0x0000000000700000 RSQ64H + +FSETP, DSETP, FSET, DSET, FCMP: cmp +0x0001000000000000 .LT +0x0002000000000000 .EQ +0x0003000000000000 .LE +0x0004000000000000 .GT +0x0004000000000000 +0x0005000000000000 .NE +0x0006000000000000 .GE +0x0007000000000000 .NUM +0x0008000000000000 .NAN +0x0009000000000000 .LTU +0x000a000000000000 .EQU +0x000b000000000000 .LEU 
+0x000c000000000000 .GTU +0x000d000000000000 .NEU +0x000e000000000000 .GEU + +FSETP, DSETP, FSET, DSET: bool +0x0000000000000000 AND +0x0000200000000000 OR +0x0000400000000000 XOR + +HSETP2: cmp +0x0000002800000000 .NE + +HSETP2: bool +0x0000000000000000 AND + +S2R: sr +0x0000000000000000 LANEID +0x0000000000200000 VIRTCFG +0x0000000000300000 VIRTID +0x0000000002100000 TID.X +0x0000000002200000 TID.Y +0x0000000002300000 TID.Z +0x0000000002500000 CTAID.X +0x0000000002600000 CTAID.Y +0x0000000002700000 CTAID.Z +0x0000000003800000 EQMASK +0x0000000003900000 LTMASK +0x0000000003a00000 LEMASK +0x0000000003b00000 GTMASK +0x0000000003c00000 GEMASK + +CS2R: sr +0x0000000005000000 CLOCKLO +0x0000000005100000 CLOCKHI +0x0000000005200000 GLOBALTIMERLO +0x0000000005300000 GLOBALTIMERHI + +B2R +0x0000e00000000000 nop45 + +BAR +0x0000100000000000 i8w4 +0x0000080000000000 nor20 +0x0000038000000000 nop39 + +BAR: mode +0x0000000000000000 SYNC +0x0000000100000000 ARV +0x0000000200000000 RED + +BAR: red +0x0000000000000000 POPC +0x0000000800000000 AND +0x0000001000000000 OR + +MEMBAR: mode +0x0000000000000000 CTA +0x0000000000000100 GL +0x0000000000000200 SYS + +VOTE: mode +0x0000000000000000 ALL +0x0001000000000000 ANY +0x0002000000000000 EQ + +VOTE +0x00000000000000ff nor0 + +BRA +0x0000000000000080 U + +TLDS: chnls +0x0010000000000000 RGBA + +TLDS +0x0002000000000000 NODEP + +LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS +0x000000000000ff00 nor8 + +LD, ST: type +0x0000000000000000 .U8 +0x0020000000000000 .S8 +0x0040000000000000 .U16 +0x0060000000000000 .S16 +0x0080000000000000 +0x0080000000000000 .32 +0x00a0000000000000 .64 +0x00c0000000000000 .128 + +LD, ST: cache +0x0100000000000000 CG +0x0200000000000000 CS +0x0300000000000000 CV +0x0300000000000000 WT + +LDG, STG, LDS, STS, LDL, STL, LDC: type +0x0000000000000000 .U8 +0x0001000000000000 .S8 +0x0002000000000000 .U16 +0x0003000000000000 .S16 +0x0004000000000000 +0x0004000000000000 .32 +0x0005000000000000 .64 
+0x0006000000000000 .128 + +LDG, STG: cache +0x0000400000000000 CG +0x0000800000000000 CI +0x0000800000000000 CS +0x0000c00000000000 CV +0x0000c00000000000 WT + +LDL: cache +0x0000200000000000 CI + +LDC: cache +0x0000100000000000 IL + +LDG, STG, LDS, STS, LDL, STL, LDC +0x0000200000000000 E + +LDS +0x0000100000000000 U + +RED: type +0x0000000000000000 +0x0000000000100000 .S32 +0x0000000000200000 .U64 +0x0000000000300000 .F32.FTZ.RN +0x0000000000400000 .F16x2.FTZ.RN +0x0000000000500000 .S64 + +RED: mode +0x0000000000000000 ADD +0x0000000000800000 MIN +0x0000000001000000 MAX +0x0000000001800000 INC +0x0000000002000000 DEC +0x0000000002800000 AND +0x0000000003000000 OR +0x0000000003800000 XOR + +ATOM: type +0x0000000000000000 +0x0002000000000000 .S32 +0x0004000000000000 .U64 +0x0006000000000000 .F32.FTZ.RN +0x0008000000000000 .F16x2.FTZ.RN +0x000a000000000000 .S64 +0x0002000000000000 .64 + +ATOM, RED +0x0001000000000000 E + +ATOM: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x03f0000000000000 CAS + +ATOMS: type +0x0000000000000000 +0x0000000010000000 .S32 +0x0000000020000000 .U64 +0x0000000030000000 .S64 +0x0010000000000000 .64 + +ATOMS: mode +0x0000000000000000 ADD +0x0010000000000000 MIN +0x0020000000000000 MAX +0x0030000000000000 INC +0x0040000000000000 DEC +0x0050000000000000 AND +0x0060000000000000 OR +0x0070000000000000 XOR +0x0080000000000000 EXCH +0x0240000000000000 CAS +}; + +# The existence of a capture group can map directly to an op code adjustment, or... 
+# The named capture group value can map the op code adjustmemt from among several options +our %flags; +my (@ops, $flag); +foreach my $line (@flags) +{ + if ($line =~ m'^(0x[0-9a-z]+)\s*(.*)') + { + my $val = hex($1); + # named rules (op: name) + if ($flag) + { $flags{$_}{$flag}{$2} = $val foreach @ops; } + # simple existence check rules + else + { $flags{$_}{$2} = $val foreach @ops; } + } + else + { + my ($ops, $name) = split ':\s*', $line; + @ops = split ',\s*', $ops; + $flag = $name; + } +} + +sub parseInstruct +{ + my ($inst, $grammar) = @_; + return unless $inst =~ $grammar->{rule}; + my %capData = %+; + return \%capData; +} + +# for immediate or constant operands and a given opcode, bits 56-63 get transformed +my %immedOps = map { $_ => 1 } qw(i20 f20 d20); +my %immedCodes = +( + 0x5c => 0x64, + 0x5b => 0x6d, + 0x59 => 0x6b, + 0x58 => 0x68, +); +my %constCodes = +( + c20 => 0x10, + c39 => 0x08, +); +my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4); + +# just pick out the reuse code and nothing else +sub genReuseCode +{ + my $capData = shift; + my $reuse = 0; + $reuse |= $reuseCodes{$_} foreach grep $capData->{$_}, keys %reuseCodes; + return $reuse; +} + +# Generate an op code from regex capture data +# if you pass in a test array ref it will populate it with the matching capture groups +sub genCode +{ + my ($op, $grammar, $capData, $test) = @_; + + my $flags = $flags{$op}; + my $code = $grammar->{code}; + my $reuse = 0; + my $immedCode = $immedCodes{$code >> 56}; + + #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I'; + + # process the instruction predicate (if valid for this instuction) + if (exists $capData->{noPred}) + { + delete $capData->{noPred}; + push @$test, 'noPred' if $test; + } + else + { + my $p = defined($capData->{predNum}) ? 
$capData->{predNum} : 7; + push @$test, 'predNum' if $test; + if (exists $capData->{predNot}) + { + $p |= 8; + push @$test, 'predNot' if $test; + } + $code ^= $p << 16; + delete @{$capData}{qw(predNum predNot)}; + + } + # process the register reuse flags + foreach my $rcode (qw(reuse1 reuse2 reuse3)) + { + if (delete $capData->{$rcode}) + { + $reuse |= $reuseCodes{$rcode}; + push @$test, $rcode if $test; + } + } + + foreach my $capture (keys %$capData) + { + # change the base code for immediate versions of the op + if (exists $immedOps{$capture}) + { $code ^= $immedCode << 56; } + # change the base code for constant versions of the op + elsif (exists $constCodes{$capture}) + { $code ^= $constCodes{$capture} << 56; } + + # if capture group is an operand then process and add that data to code + if (exists $operands{$capture}) + { + # don't process the r20 that comes with the r39s20 capture + unless ($capture eq 'r20' && exists $capData->{r39s20}) + { + $code ^= $operands{$capture}->($capData->{$capture}); + push @$test, $capture if $test; + } + } + + # Add matching flags (an operand might also add/remove a flag) + if (exists $flags->{$capture}) + { + # a named multivalue flag + if (ref $flags->{$capture}) + { + $code ^= $flags->{$capture}{$capData->{$capture}}; + push @$test, "$capture:$capData->{$capture}" if $test; + } + # a simple exists flag + else + { + $code ^= $flags->{$capture}; + push @$test, $capture if $test; + } + } + elsif (!exists $operands{$capture} && !$test) + { + # Every capture group should be acted upon. Missing one is a bug. 
+ warn "UNUSED: $op: $capture: $capData->{$capture}\n"; + warn Dumper($flags); + } + } + + return $code, $reuse; +} + + +my $CtrlRe = qr'(?[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])'; +my $PredRe = qr'(?@!?(?P\d)\s+)'; +my $InstRe = qr"$PredRe?(?\w+)(?[^;]*;)"o; +my $CommRe = qr'(?.*)'; + +sub processAsmLine +{ + my ($line, $lineNum) = @_; + + if ($line =~ m"^$CtrlRe(?\s+)$InstRe$CommRe"o) + { + return { + lineNum => $lineNum, + pred => $+{pred}, + predReg => $+{predReg}, + space => $+{space}, + op => $+{op}, + comment => $+{comment}, + inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}), + ctrl => readCtrl($+{ctrl}, $line), + }; + } + return undef; +} + +sub processSassLine +{ + my $line = shift; + + if ($line =~ m"^\s+/\*(?[0-9a-f]+)\*/\s+$InstRe\s+/\* (?0x[0-9a-f]+)"o) + { + return { + num => hex($+{num}), + pred => $+{pred}, + op => $+{op}, + ins => normalizeSpacing($+{op} . $+{rest}), + inst => normalizeSpacing($+{pred} . $+{op} . $+{rest}), + code => hex($+{code}), + }; + } + return undef; +} + +sub processSassCtrlLine +{ + my ($line, $ctrl, $ruse) = @_; + + return 0 unless $line =~ m'^\s+\/\* (0x[0-9a-f]+)'; + + my $code = hex($1); + if (ref $ctrl) + { + push @$ctrl, ($code & 0x000000000001ffff) >> 0; + push @$ctrl, ($code & 0x0000003fffe00000) >> 21; + push @$ctrl, ($code & 0x07fffc0000000000) >> 42; + } + if (ref $ruse) + { + push @$ruse, ($code & 0x00000000001e0000) >> 17; + push @$ruse, ($code & 0x000003c000000000) >> 38; + push @$ruse, ($code & 0x7800000000000000) >> 59; + } + return 1; +} + +sub replaceXMADs +{ + my $file = shift; + +# XMAD.LO d, a, b, c, x; +# ---------------------- +# XMAD.MRG x, a, b.H1, RZ; +# XMAD d, a, b, c; +# XMAD.PSL.CBCC d, a.H1, x.H1, d; +# ---------------------- +# XMAD d, a, 0xffff, c; +# XMAD.PSL d, a.H1, 0xffff, d; + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD\.LO\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*,\s*(?\w+)\s*;$CommRe/ + + die "XMAD.LO: Destination and first operand cannot 
be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s +%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s; +%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;', + @+{qw(ctrl space pred d a b c x comment)} + /egmos; + + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD(?(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?-?$immed|\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/ + + die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s +%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;', + @+{qw(ctrl space pred d a b c comment mod)} + /egmos; + + $file =~ s/\n\s*$CtrlRe(?\s+)($PredRe)?XMAD(?(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?\w+)\s*,\s*(?\w+)\s*,\s*(?c\[$hex\]\[$hex\]|\w+)\s*,\s*(?\w+)\s*;$CommRe/ + + die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a}; + sprintf ' +%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s +%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;', + @+{qw(ctrl space pred d a b c comment mod)} + /egmos; + + #TODO: add more XMAD macros + return $file; +} +# convert extra spaces to single spacing to make our re's simplier +sub normalizeSpacing +{ + my $inst = shift; + $inst =~ s/\t/ /g; + $inst =~ s/\s{2,}/ /g; + return $inst; +} + + +# map binary control notation on to easier to work with format. +sub printCtrl +{ + my $code = shift; + + my $stall = ($code & 0x0000f) >> 0; + my $yield = ($code & 0x00010) >> 4; + my $wrtdb = ($code & 0x000e0) >> 5; # write dependency barier + my $readb = ($code & 0x00700) >> 8; # read dependency barier + my $watdb = ($code & 0x1f800) >> 11; # wait on dependency barier + + $yield = $yield ? '-' : 'Y'; + $wrtdb = $wrtdb == 7 ? '-' : $wrtdb + 1; + $readb = $readb == 7 ? '-' : $readb + 1; + $watdb = $watdb ? 
sprintf('%02x', $watdb) : '--'; + + return sprintf '%s:%s:%s:%s:%x', $watdb, $readb, $wrtdb, $yield, $stall; +} +sub readCtrl +{ + my ($ctrl, $context) = @_; + my ($watdb, $readb, $wrtdb, $yield, $stall) = split ':', $ctrl; + + $watdb = $watdb eq '--' ? 0 : hex $watdb; + $readb = $readb eq '-' ? 7 : $readb - 1; + $wrtdb = $wrtdb eq '-' ? 7 : $wrtdb - 1; + $yield = $yield eq 'y' || $yield eq 'Y' ? 0 : 1; + $stall = hex $stall; + + die sprintf('wait dep out of range(0x00-0x3f): %x at %s', $watdb, $context) if $watdb != ($watdb & 0x3f); + + return + $watdb << 11 | + $readb << 8 | + $wrtdb << 5 | + $yield << 4 | + $stall << 0; +} + +sub getRegNum +{ + my ($regMap, $regName) = @_; + + return !exists($regMap->{$regName}) || ref($regMap->{$regName}) ? $regName : $regMap->{$regName}; +} + +sub getVecRegisters +{ + my ($vectors, $capData) = @_; + my $regName = $capData->{r0} or return; + + return if $regName eq 'RZ'; + + if ($capData->{type} eq '.64' || $capData->{i31w4} eq '0x3') + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. $1+1); + } + confess "$regName not a 64bit vector register" unless exists $vectors->{$regName}; + return @{$vectors->{$regName}}[0,1]; + } + if ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. $1+3); + } + confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4; + return @{$vectors->{$regName}}; + } + return $regName; +} + +sub getAddrVecRegisters +{ + my ($vectors, $capData) = @_; + my $regName = $capData->{r8} or return; + + return if $regName eq 'RZ'; + + if (exists $capData->{E}) + { + if ($regName =~ m'^R(\d+)$') + { + return map "R$_", ($1 .. 
$1+1); + } + print Dumper($vectors) unless exists $vectors->{$regName}; + confess "$regName not a 64bit vector register" unless exists $vectors->{$regName}; + return @{$vectors->{$regName}}[0,1]; + } + return $regName; +} + +__END__ + + + diff --git a/Assembler/PascalAs/microbench/microbench.cpp b/Assembler/PascalAs/microbench/microbench.cpp new file mode 100644 index 0000000..7b0187a --- /dev/null +++ b/Assembler/PascalAs/microbench/microbench.cpp @@ -0,0 +1,212 @@ +// microbench.cpp : Defines the entry point for the console application. +// + +// nvcc -l cuda -o microbench microbench.cpp + +#include +#include +#include +#include +#include + +CUcontext hContext = 0; + +#define CUDA_CHECK( fn ) do { \ + CUresult status = (fn); \ + if ( CUDA_SUCCESS != status ) { \ + const char* errstr; \ + cuGetErrorString(status, &errstr); \ + printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \ + if (hContext) cuCtxDestroy(hContext); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + + +int main(int argc, char* argv[]) +{ + //int iTest = 2896; + //while (iTest < 0x7fff) + //{ + // int iResult = iTest * iTest; + // float fTest = (float)iTest; + // int fResult = (int)(fTest * fTest); + + // printf("i*i:%08x f*f:%08x\n", iResult, fResult); + + // iTest += 0x0800; + //} + //exit(0); + + char deviceName[32]; + int devCount, ordinal, major, minor; + CUdevice hDevice; + + // Initialize the Driver API and find a device + CUDA_CHECK( cuInit(0) ); + CUDA_CHECK( cuDeviceGetCount(&devCount) ); + for (ordinal = 0; ordinal < devCount; ordinal++) + { + CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); + CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); + CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); + CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); + if (major >= 5 && minor >= 2) + { + printf("Using: Id:%d 
%s (%d.%d)\n\n", ordinal, deviceName, major, minor); + break; + } + } + if (ordinal == devCount) + { + printf("No compute 5.0 device found, exiting.\n"); + exit(EXIT_FAILURE); + } + + // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing + int internalTiming = 1; + if (argc > 1) + internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0; + + // Second command line arg is the number of blocks + int blocks = 1; + if (argc > 2) + blocks = atoi(argv[2]); + if (blocks < 1) + blocks = 1; + + // Third command line arg is the number of threads + int threads = 128; + if (argc > 3) + threads = atoi(argv[3]); + if (threads > 1024 || threads < 32) + threads = 128; + threads &= -32; + + // Forth command line arg: + double fops = 1.0; + int lanes = 1; + if (argc > 4) + { + if (internalTiming) + { + // The number of lanes to print for each warp + lanes = atoi(argv[4]); + if (lanes > 32 || lanes < 1) + lanes = 1; + } + else + // The number of floating point operations in a full kernel launch + fops = atof(argv[4]); + } + + // Fifth command line arg is the repeat count for benchmarking + int repeat = 1; + if (argc > 5) + repeat = atoi(argv[5]); + if (repeat > 1000 || repeat < 1) + repeat = 1; + + // threads = total number of threads + size_t size = sizeof(int) * threads * blocks; + + // Setup our input and output buffers + int* dataIn = (int*)malloc(size); + int* dataOut = (int*)malloc(size); + int* clocks = (int*)malloc(size); + memset(dataIn, 0, size); + + CUmodule hModule; + CUfunction hKernel; + CUevent hStart, hStop; + CUdeviceptr devIn, devOut, devClocks; + + // Init our context and device memory buffers + CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); + CUDA_CHECK( cuMemAlloc(&devIn, size) ); + CUDA_CHECK( cuMemAlloc(&devOut, size) ); + CUDA_CHECK( cuMemAlloc(&devClocks, size) ); + CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) ); + CUDA_CHECK( cuMemsetD8(devOut, 0, size) ); + CUDA_CHECK( cuMemsetD8(devClocks, 0, size) ); + + CUDA_CHECK( 
cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); + CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); + + // Load our kernel + CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); + CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); + + // Setup the params + void* params[] = { &devOut, &devClocks, &devIn }; + float ms = 0; + + // Warm up the clock (unless under nsight) + if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER + for (int i = 0; i < repeat; i++) + CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); + + // Launch the kernel + CUDA_CHECK( cuEventRecord(hStart, NULL) ); + //CUDA_CHECK( cuProfilerStart() ); + CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); + //CUDA_CHECK( cuProfilerStop() ); + CUDA_CHECK( cuEventRecord(hStop, NULL) ); + CUDA_CHECK( cuEventSynchronize(hStop) ); + CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); + + //CUDA_CHECK( cuCtxSynchronize() ); + + // Get back our results from each kernel + CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); + CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); + + // Cleanup and shutdown of cuda + CUDA_CHECK( cuEventDestroy(hStart) ); + CUDA_CHECK( cuEventDestroy(hStop) ); + CUDA_CHECK( cuModuleUnload(hModule) ); + CUDA_CHECK( cuMemFree(devIn) ); + CUDA_CHECK( cuMemFree(devOut) ); + CUDA_CHECK( cuMemFree(devClocks) ); + CUDA_CHECK( cuCtxDestroy(hContext) ); + hContext = 0; + + // When using just one block, print out the internal timing data + if (internalTiming) + { + int count = 0, total = 0, min = 999999, max = 0; + + int* clocks_p = clocks; + int* dataOut_p = dataOut; + + // Loop over and print results + for (int blk = 0; blk < blocks; blk++) + { + float *fDataOut = reinterpret_cast(dataOut_p); + + for(int tid = 0; tid < threads; tid += 32) + { + // Sometimes we want data on each thread, sometimes just one sample per warp is fine + for (int lane = 0; lane < lanes; 
lane++) + printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u + + count++; + total += clocks_p[tid]; + if (clocks_p[tid] < min) min = clocks_p[tid]; + if (clocks_p[tid] > max) max = clocks_p[tid]; + } + clocks_p += threads; + dataOut_p += threads; + } + printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); + } + else + { + // For more than one block we're testing throughput and want external timing data + printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); + } + // And free up host memory + free(dataIn); free(dataOut); free(clocks); + + return 0; +} diff --git a/Assembler/PascalAs/microbench/microbench.cu b/Assembler/PascalAs/microbench/microbench.cu new file mode 100644 index 0000000..7d4cd8f --- /dev/null +++ b/Assembler/PascalAs/microbench/microbench.cu @@ -0,0 +1,69 @@ + +// Note this file isn't configured to automatically compile + +#include +#include + +// Build: +// nvcc -l cuda -o microbench microbench.cpp +// nvcc -arch sm_50 -cubin microbench.cu + +// Inspect a cubin (use nvdisasm from cuda 6.5 for best results): +// maxas.pl -e microbench.cubin + +// Insert new sass into cubin +// maxas.pl -i microbench.sass microbench.cubin + +// run it: +// ./microbench + +// Use extern C so C++ doesn't mangle our kernel name +extern "C" __global__ void microbench(int *out, int *clocks, int *in) +{ + __shared__ int share[1024]; + + int tid = threadIdx.x; + int bx = blockIdx.x; + int by = blockIdx.y; + + int start = clock(); + + share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ + + __syncthreads(); + + int end = clock(); + + clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start; + + out[tid] = share[tid ^ 1]; +} + +// A note about using the Cuda Runtime. 
+// If that's your preference over the driver API then here's what you'd do: + +// In your project properties in the Cuda C/C++ panel: +// -Set the "Keep Processed Files" (-keep) option +// -Add a -v manually to the command line +// If compiling on command line just add -keep -v options to nvcc. +// Rebuild your solution and look in the log for these lines that follow the ptxas step: + +// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda +// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" +// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" + +// You just need to manually run these 3 commands (or add them to a build script) +// after you've modified the cubin generated from the preceding ptxas command. +// That will give you a new .cu.obj file which will automatically be linked in for you next time you +// build your project (or you could manually run the linker step as well). + +// Having done that you can call your kernel normally using the <<< >>> syntax. +// Debugging will have to be with the sass syntax but that's what you'll want to see anyway. +// With fatbin you can also keep non-maxwell optimized versions of your code. + + +// I just discovered this also works as a shortcut to the above: +// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu + +// The cu kernel definitions above need to have empty bodies. +// And, the cu file must be compiled to a lib separately before linking. 
\ No newline at end of file diff --git a/Assembler/PascalAs/microbench/microbench.sass b/Assembler/PascalAs/microbench/microbench.sass new file mode 100644 index 0000000..609274a --- /dev/null +++ b/Assembler/PascalAs/microbench/microbench.sass @@ -0,0 +1,72 @@ +# Kernel: microbench + +// This is a simple micro bench to demonstrate the latency in loading SR_TID.X + + + blockDimX : c[0x0][0x08] + blockDimY : c[0x0][0x0c] + blockDimZ : c[0x0][0x10] + gridDimX : c[0x0][0x14] + gridDimY : c[0x0][0x18] + gridDimZ : c[0x0][0x1c] + + param_out[0] : c[0x0][0x140] + param_out[1] : c[0x0][0x144] + param_clocks[0] : c[0x0][0x148] + param_clocks[1] : c[0x0][0x14c] + param_in[0] : c[0x0][0x150] + param_in[1] : c[0x0][0x154] + + + + + 0-1 : out<0-1> + 2-3 : clocks<0-1> + 4-5 : in<0-1> + 6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x + + + +// Load in our params (not currently used below) +--:-:-:-:1 MOV in0, param_in[0]; +--:-:-:-:1 MOV in1, param_in[1]; + +// Get the first clock value +--:-:-:-:1 CS2R clock1, SR_CLOCKLO; + +// Get the threadId and blockId +// Set the Read-After-Write dependency barrier 1 and 2 +--:-:1:-:1 S2R tid, SR_TID.X; +// Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it +--:-:2:-:2 S2R bid, SR_CTAID.X; + + +// Get the second clock value +// Wait on the dependency barriers that were set in the prior instruction +// Stall 6 to allow CS2R time to complete before next instruction +// CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks +// This stall count does not factor into the time calculation at all +03:-:-:-:6 CS2R clock2, SR_CLOCKLO; + +// Take the difference of clocks +--:-:-:-:1 IADD clock1, clock2, -clock1; + +// Setup our output addresses +// Stall your pipeline dependencies properly +// Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code +--:-:-:-:6 XMAD offset, bid, blockDimX, tid; + +// 
LEA is "load effective address" +// The offset param is shifted left 2 and added to the pointers with 64bit math +--:-:-:-:6 LEA clocks0.CC, offset, param_clocks[0], 2; +--:-:-:-:1 LEA.HI.X clocks1, offset, param_clocks[1], RZ, 2; + +--:-:-:-:6 LEA out0.CC, offset, param_out[0], 2; +--:-:-:-:1 LEA.HI.X out1, offset, param_out[1], RZ, 2; + +// Output the results. +// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values +--:-:-:-:1 STG.E [clocks], clock1; +--:-:-:-:1 STG.E [out], offset; # use this to return whatever you like to inspect the results +--:-:-:-:5 EXIT; + diff --git a/Assembler/PascalAs/microbench/shared.pl b/Assembler/PascalAs/microbench/shared.pl new file mode 100755 index 0000000..f760664 --- /dev/null +++ b/Assembler/PascalAs/microbench/shared.pl @@ -0,0 +1,12 @@ +#!/usr/bin/perl +use strict; + +print `maxas.pl -i shared_sts16.sass microbench.cubin`; + +exit if $?; + +print `Release\\microbench.exe i 1 64`; + + +__END__ + diff --git a/Assembler/PascalAs/microbench/shared_lds.sass b/Assembler/PascalAs/microbench/shared_lds.sass new file mode 100644 index 0000000..5f31dcf --- /dev/null +++ b/Assembler/PascalAs/microbench/shared_lds.sass @@ -0,0 +1,122 @@ +# Kernel: microbench +# InsCnt: 18 +# RegCnt: 5 +# SharedSize: 4096 +# BarCnt: 1 +# Params(3): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 + +// This is a simple micro bench to demonstrate the latency in loading SR_TID.X + + + + 0-3 : result, a, b, c + + 4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20> + + + +// Load in our params +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R bid, SR_CTAID.X; + +--:-:-:-:1 MOV result, c[0x0][0x0]; +--:-:-:-:1 MOV in, c[0x0][0x100]; + +--:-:-:-:1 CS2R clock1, SR_CLOCKLO; +--:-:-:-:1 MOV result, c[0x0][0x13c]; +--:-:-:-:1 CS2R clock2, SR_CLOCKLO; + +--:-:-:-:1 MOV blockDim, c[0x0][0x8]; +--:-:-:-:1 MOV out, 
c[0x0][0x140]; +--:-:-:-:1 MOV clocks, c[0x0][0x144]; + + + + + + +03:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 LOP.AND tid128, tid, 128; + +// readAs = ((tid128 >> 4) | tid7) << 4 +--:-:-:-:1 SHR.U32 readAs, tid128, 4; +--:-:-:-:1 LOP.OR readAs, readAs, tid7; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid96 >> 3) | tid3) << 4 +--:-:-:-:1 SHR.U32 readBs, tid96, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid3; +#--:-:-:-:1 SHL readBs, readBs, 4; +#--:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; + + + + + + +#--:-:-:-:1 LDS.U.128 result, [readBs]; + + + + +01:-:-:-:1 IADD clock1, clock2, -clock1; + + +--:-:-:-:1 XMAD tid, blockDim, bid, tid; +--:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; +--:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; +--:-:-:Y:6 SHL tid, tid, 0x2; + +--:-:-:-:1 IADD clocks, clocks, tid; +--:-:-:-:2 IADD out, out, tid; + +--:-:-:-:1 STG [clocks], clock1; +--:-:-:-:1 STG [out], readBs; +--:-:-:-:5 EXIT; + + + +--:-:-:-:4 LOP.AND tid32, tid, -32; + +--:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; + +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; + + +// readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; +--:-:-:-:1 BFE.U32 tid7, tid, 
0x301; +--:-:-:-:1 LOP.AND readAs, tid, 0x80; +--:-:-:-:1 SHR.U32 readAs, readAs, 4; +--:-:-:-:1 LOP.OR readAs, readAs, tid7; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid1, tid, 0x1; +--:-:-:-:1 LOP.AND readBs, tid, 0x70; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; + + + \ No newline at end of file diff --git a/Assembler/PascalAs/microbench/shared_sts16.sass b/Assembler/PascalAs/microbench/shared_sts16.sass new file mode 100644 index 0000000..2f6eb39 --- /dev/null +++ b/Assembler/PascalAs/microbench/shared_sts16.sass @@ -0,0 +1,116 @@ +# Kernel: microbench +# InsCnt: 18 +# RegCnt: 5 +# SharedSize: 4096 +# BarCnt: 1 +# Params(3): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 + +// This is a simple micro bench to demonstrate the latency in loading SR_TID.X + + + + 0-3 : result, a, b, c + + 4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20> + + + +// Load in our params +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R bid, SR_CTAID.X; + +//--:-:-:-:1 MOV result, c[0x0][0x0]; +//--:-:-:-:1 MOV in, c[0x0][0x100]; +--:-:-:-:1 MOV result, 1; + +--:-:-:-:1 MOV blockDim, c[0x0][0x8]; +--:-:-:-:1 MOV out, c[0x0][0x140]; +--:-:-:-:1 MOV clocks, c[0x0][0x144]; + + +// readAs = ((tid >> 1) & 7) << 4; +03:-:-:-:6 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:6 SHL readAs, readAs, 3; + +// readBs = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 1024; +--:-:-:-:6 LOP.AND tid1, tid, 1; +--:-:-:-:6 LOP.AND readBs, tid, 0x30; +--:-:-:-:6 SHR.U32 readBs, readBs, 3; +--:-:-:-:6 LOP.OR readBs, readBs, tid1; +--:-:-:-:6 ISCADD readBs, readBs, 0, 3; + + + +///--:-:-:-:1 STS [tid32], result; +//--:-:-:-:1 STS.S16 [tid32 + 2x<32>], result; +//--:-:1:-:2 LDS.U.64 result, [readBs]; + +--:-:-:-:0 CS2R clock1, SR_CLOCKLO; +--:-:1:-:6 LDS.U.64 result, 
[readAs]; +--:-:-:-:6 CS2R clock2, SR_CLOCKLO; + + +01:-:-:-:1 IADD clock1, clock2, -clock1; + + +--:-:-:-:1 XMAD tid, blockDim, bid, tid; +--:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; +--:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; +--:-:-:Y:6 SHL tid, tid, 0x2; + +--:-:-:-:1 IADD clocks, clocks, tid; +--:-:-:-:2 IADD out, out, tid; + +--:-:-:-:1 STG [clocks], clock1; +--:-:-:-:1 STG [out], result; +--:-:-:-:5 EXIT; + + + +--:-:-:-:4 LOP.AND tid32, tid, -32; + +--:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; + +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; +--:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; + +03:-:-:-:6 LOP.AND tid31, tid, 31; +--:-:-:-:6 LOP.AND tid32, tid, 32; +--:-:-:-:6 SHL tid32, tid32, 0x2; +--:-:-:-:6 LOP.OR tid32, tid32, tid31; +--:-:-:-:6 SHL tid32, tid32, 0x2; + +// readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; +--:-:-:-:1 LOP.AND readAs, tid, 0x80; +--:-:-:-:1 SHR.U32 readAs, readAs, 4; +--:-:-:-:1 LOP.OR readAs, readAs, tid7; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid1, tid, 0x1; +--:-:-:-:1 LOP.AND readBs, tid, 0x70; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD 
readBs, readBs, 4x<1024>, 4; + + + \ No newline at end of file diff --git a/Assembler/PascalAs/microbench/throughput.pl b/Assembler/PascalAs/microbench/throughput.pl new file mode 100755 index 0000000..56df6e7 --- /dev/null +++ b/Assembler/PascalAs/microbench/throughput.pl @@ -0,0 +1,80 @@ +#!/usr/bin/perl +use strict; + +my $loopSize = 512; +my $blocks = 32; +my $loops = 10240000; +my $fileName = 'throughput2.sass'; + +writeSassFile($fileName, $loops); + +#print `maxas.pl -p $fileName`; +#exit; + +print `maxas.pl -i $fileName microbench.cubin`; +exit if $?; + +foreach my $thread128 (2) +{ + my $threads = $thread128 * 128; + my $fops = 2 * $loops * $loopSize * $blocks * $threads; + + my $data = `Release\\microbench.exe e $blocks $threads $fops`; + + my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + + printf "%d %d %d\n", $thread128, $threads, $gflops; +} + +exit; + +sub writeSassFile +{ + my ($filename, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'EOF', $loops; +# Kernel: microbench + + + + 0-10 : result, r1, r2, r3 + 20-27 ~ count, stop + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +--:-:-:-:1 MOV32I r1, 1.0; +--:-:-:-:1 MOV32I r2, 1.0; +--:-:-:-:4 MOV32I r3, 1.0; + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + foreach my $i (0 .. 511) + { + my $yield = ($i + 32) & 63 ? '-' : 'Y'; + + my $stall = $i == 511 ? 
0 : 1; + + $out .= "--:-:-:$yield:$stall FFMA result, r1, r2, r3;\n"; + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +EOF + + close $fh; +} + +__END__ + diff --git a/Assembler/PascalAs/microbench/throughput.sass b/Assembler/PascalAs/microbench/throughput.sass new file mode 100644 index 0000000..796502f --- /dev/null +++ b/Assembler/PascalAs/microbench/throughput.sass @@ -0,0 +1,95 @@ +# Kernel: microbench +# InsCnt: 18 +# RegCnt: 5 +# SharedSize: 4096 +# BarCnt: 1 +# Params(3): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 + + + + 8-20 : count + + + +--:-:-:-:1 MOV R0, RZ; +--:-:-:-:1 MOV R1, RZ; +--:-:-:-:1 MOV R2, RZ; +--:-:-:-:1 MOV R3, RZ; +--:-:-:-:1 MOV R4, RZ; +--:-:-:-:1 MOV R5, RZ; +--:-:-:-:1 MOV R6, RZ; +--:-:-:-:1 MOV R7, RZ; +--:-:-:-:1 MOV R8, RZ; +--:-:-:Y:6 MOV count, RZ; + +// This loop is capable of running at 1700 GFlops on GM107. +// You can tweak it to see how register bank conflicts or different control codes +// affect performance. +// With throughput.pl you can pass params to this code and do some autotuning. +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, 0x19000, PT; +--:-:-:-:1 IADD count, count, 0x1; + + + my $out; + + foreach my $i (0..511) #511 + { + my $y = ($i + 32) & 63 ? 
'-' : 'Y'; + + $out .= qq| +--:-:-:$y:1 FFMA R0, R1, R2, R3;|; #c[0x0][$c] + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; + +--:-:-:-:5 EXIT; + + + + + open my $fh, 'params.txt'; + my $line = <$fh>; + close $fh; + my ($r1, $r2, $r3) = split "\t", $line; + + 80-95 : out, clocks, in, tid, clock1, clock2, result + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:-:-:1 MOV out, c[0x0][0x140]; +--:-:-:-:1 MOV clocks, c[0x0][0x144]; +01:-:-:-:1 MOV in, c[0x0][0x148]; + + + +--:-:-:-:1 MOV32I f0, 0x3f800000; +--:-:-:-:1 MOV32I f1, 0x3f800000; +--:-:-:-:1 MOV32I f2, 0x3f800000; +--:-:-:-:5 MOV32I f3, 0x3f800000; + +--:-:-:-:1 CS2R clock1, SR_CLOCKLO; + + +--:-:-:-:1 CS2R clock2, SR_CLOCKLO; + +--:-:-:-:6 MOV32I result, 0x457; +--:-:-:-:1 IADD clock1, clock2, -clock1; + + +--:-:-:-:6 SHL tid, tid, 0x2; +--:-:-:-:1 IADD clocks, clocks, tid; +--:-:-:-:1 IADD out, out, tid; + +--:-:-:-:1 STG [clocks], clock1; +--:-:-:-:1 STG [out], R24; + + + \ No newline at end of file diff --git a/Assembler/PascalAs/microbench/throughput2.pl b/Assembler/PascalAs/microbench/throughput2.pl new file mode 100755 index 0000000..ea7e19f --- /dev/null +++ b/Assembler/PascalAs/microbench/throughput2.pl @@ -0,0 +1,158 @@ +#!/usr/bin/perl +use strict; +my %p; + +$p{N} = 8192; +$p{blocking} = 8; +$p{unroll} = 8; +$p{threads} = 64; #256 + +$p{csize} = $p{blocking} * $p{blocking}; +$p{loopSize} = $p{unroll} * $p{csize}; +$p{width} = sqrt($p{csize} * $p{threads}); +$p{blocks} = ($p{N} / $p{width}) * ($p{N} / $p{width}); +$p{loops} = $p{N} / $p{unroll}; +$p{fops} = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads}; + +my $fileName = 'throughput2.sass'; + +my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops); + +#print join("\t", @params), "\n"; +#print join("\t", @p{@params}), "\n"; + +print map sprintf("%-9s: %d\n", $_, $p{$_}), @params; + +writeSassFile($fileName, $p{loopSize}, $p{loops}); + +#print `maxas.pl -p $fileName`; +#exit; + +print `maxas.pl -i $fileName 
microbench.cubin`; + +exit if $?; + +my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`; + +my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + +print $data; + +#printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; + + + + +sub writeSassFile +{ + my ($filename, $loopSize, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'END_SASS', $loops; +# Kernel: microbench + + + + 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> + 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> + 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> + 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> + 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> + + 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> + 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> + + 0-127 : r<0-127> + + 100-101 : count, stop + + //102-112 ~ readAs, readBs, writeS + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +//--:-:-:-:1 MOV writeS, RZ; +//--:-:-:-:1 MOV readAs, RZ; +//--:-:-:-:1 MOV readBs, RZ; + + + return join '', map "--:-:-:-:1 MOV32I r$_, 1.0;\n", 0..95; + + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + + my @cOrder; + #my @swirl = ([0,1],[0,0],[2,0],[2,1]); + my @swirl = ([2,0],[2,1],[0,1],[0,0]); + #my @swirl = ([0,1],[0,0],[1,0],[1,1]); + my @xVals = (0,1,64,65); + #my @xVals = (0,2,64,66); + + my @yVals = (0,2,64,66); + + foreach my $y (@yVals) + { + foreach my $x (@xVals) + { + push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl; + } + @xVals = reverse @xVals; + } + + foreach my $j (0..7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + + my %%insert; + + #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6; + + $insert{c62} = + "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . + "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 
+ "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . + "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . + "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8; + + foreach my $c (0 .. 63) + { + my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/; + my $ins = $insert{"c$c"} || ''; + my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins || + my $yield = $c == 32 ? 'Y' : '-'; + my $wait = '--'; #$c ? '--' : '01'; + + $out .= "$wait:-:-:$yield:$stall FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins"; + } + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +END_SASS + + close $fh; +} + +__END__ + + my %%insert = ( + c0 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n", + c2 => "--:-:-:-:1 LDS.U.128 j${nOdd}By00, [readBs+0x10];\n", + c4 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n", + c6 => "--:-:1:-:1 LDS.U.128 j${nOdd}By64, [readBs+0x10];\n", + ); \ No newline at end of file diff --git a/Assembler/PascalAs/microbench/throughput2.sass b/Assembler/PascalAs/microbench/throughput2.sass new file mode 100644 index 0000000..3db5130 --- /dev/null +++ b/Assembler/PascalAs/microbench/throughput2.sass @@ -0,0 +1,47 @@ +# Kernel: microbench + + + + 0-10 : result, r1, r2, r3 + 20-27 ~ count, stop + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, 102400; +--:-:-:-:1 MOV32I r1, 1.0; +--:-:-:-:1 MOV32I r2, 1.0; +--:-:-:-:4 MOV32I r3, 1.0; + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + foreach my $i (0 .. 511) + { + my $yield = ($i + 32) & 63 ? '-' : 'Y'; + + my $stall = $i == 511 ? 
0 : 1; + + #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; + + #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; + #$out .= "--:-:-:-:1 MOV result, RZ;\n"; + + $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; + #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; + #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; diff --git a/Assembler/PascalAs/microbench/throughput3.pl b/Assembler/PascalAs/microbench/throughput3.pl new file mode 100755 index 0000000..ff9077a --- /dev/null +++ b/Assembler/PascalAs/microbench/throughput3.pl @@ -0,0 +1,90 @@ +#!/usr/bin/perl +use strict; + +my %data; + +foreach my $thread128 (1 .. 8) +{ + foreach my $size64 (8 .. 16) + { + my $loopSize = $size64 * 64; + my $loops = int(2 * 1638400 / ($size64 * $thread128)); + + my $blocks = 16; + my $threads = $thread128 * 128; + my $fops = 2 * $loops * $loopSize * $blocks * $threads; + my $fileName = 'throughput2.sass'; + + #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops; + #next; + + writeSassFile($fileName, $loopSize, $loops); + + `maxas.pl -i $fileName microbench.cubin`; + + exit if $?; + + my $data = `Release\\microbench.exe e $blocks $threads $fops`; + + my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + + printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; + + push @{$data{$loopSize}}, $gflops; + } +} +print join("\t", 'size', 1 .. 
8), "\n"; +foreach my $loopSize (sort {$a <=> $b} keys %data) +{ + print join("\t", $loopSize, @{$data{$loopSize}}), "\n"; +} + +exit; + +sub writeSassFile +{ + my ($filename, $loopSize, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'EOF', $loops, $loopSize, $loopSize; +# Kernel: microbench + + + + 0-10 : result, r1, r2, r3, count, stop + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +--:-:-:-:1 MOV32I r1, 1.0; +--:-:-:-:1 MOV32I r2, 1.0; +--:-:-:-:4 MOV32I r3, 1.0; + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + foreach my $i (0 .. %d) + { + my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y'; + + $out .= "--:-:-:$y:1 FFMA result, r1, r2, r3;\n"; + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +EOF + + close $fh; +} + +__END__ + diff --git a/Assembler/PascalAs/microbench/throughput4.pl b/Assembler/PascalAs/microbench/throughput4.pl new file mode 100755 index 0000000..8f8760c --- /dev/null +++ b/Assembler/PascalAs/microbench/throughput4.pl @@ -0,0 +1,120 @@ +#!/usr/bin/perl +use strict; + +my $loopSize = 512; +my $blocks = 64; +my $loops = 102400; +my $fileName = 'throughput2.sass'; + +writeSassFile($fileName, $loops); + +#print `maxas.pl -p $fileName`; +#exit; + +print `maxas.pl -i $fileName microbench.cubin`; +exit if $?; + +foreach my $thread128 (4) +{ + my $threads = $thread128 * 128; + my $fops = 2 * $loops * $loopSize * $blocks * $threads; + + print "./microbench e $blocks $threads $fops\n\n"; + my $data = `./microbench e $blocks $threads $fops`; + exit($?) 
if $?; + + my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + + printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0; +} + +exit; + +sub writeSassFile +{ + my ($filename, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'EOF', $loops; +# Kernel: microbench + + + + 0-10 : result, r1, r2, r3 + 20-27 ~ count, stop + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +--:-:-:-:1 MOV32I r1, 1.0; +--:-:-:-:1 MOV32I r2, 1.0; +--:-:-:-:4 MOV32I r3, 1.0; + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + foreach my $i (0 .. 511) + { + my $yield = ($i + 32) & 63 ? '-' : 'Y'; + + my $stall = $i == 511 ? 0 : 1; + + #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; + #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; + + #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; + #$out .= "--:-:-:-:1 MOV result, RZ;\n"; + + $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; + #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; + #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +EOF + + close $fh; +} + +__END__ + +VMAD.U8.U8 + +dddd 2655 / 4968 = 53.4% +1d1d 4594 / 4968 = 92.4% +11d 4746 / 4968 = 95.5% +111d 4841 / 4968 = 97.4% + +block context switches are a little more expensive than thread context switches + +stall codes: + +f : 13 clocks +e : 8 clocks +d : 6 clocks +c : 8 clocks, no yield +b : 11 clocks +a : 10 clocks +9 : 9 clocks +8 : 8 clocks +7 : 7 clocks +6 : 6 clocks +5 : 5 clocks +4 : 4 clocks +3 : 3 clocks +2 : 2 clocks +1 : 1 clocks, no yield +0 : 0 clocks, no yield, dual issue \ No newline at end of file diff --git a/Assembler/PascalAs/microbench/throughput5.pl 
b/Assembler/PascalAs/microbench/throughput5.pl new file mode 100755 index 0000000..f9bda8e --- /dev/null +++ b/Assembler/PascalAs/microbench/throughput5.pl @@ -0,0 +1,164 @@ +#!/usr/bin/perl +use strict; +my %p; + +$p{N} = 8192; +$p{blocking} = 8; +$p{unroll} = 8; +$p{threads} = 64; #256 + +$p{csize} = $p{blocking} * $p{blocking}; +$p{loopSize} = $p{unroll} * $p{csize}; +$p{width} = sqrt($p{csize} * $p{threads}); +$p{blocks} = ($p{N} / $p{width}) * ($p{N} / $p{width}); +$p{loops} = $p{N} / $p{unroll}; +$p{fops} = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads}; + +my $fileName = 'throughput2.sass'; + +my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops); + +#print join("\t", @params), "\n"; +#print join("\t", @p{@params}), "\n"; + +print map sprintf("%-9s: %d\n", $_, $p{$_}), @params; + +writeSassFile($fileName, $p{loopSize}, $p{loops}); + +#print `maxas.pl -p $fileName`; +#exit; + +print `maxas.pl -i $fileName microbench.cubin`; + +exit if $?; + +my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`; + +my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; + +print $data; + +#printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; + + + + +sub writeSassFile +{ + my ($filename, $loopSize, $loops) = @_; + + open my $fh, ">$filename" or die "$filename: $!"; + + printf $fh <<'END_SASS', $loops; +# Kernel: microbench + + + + 1, 9, 2,10,17,25,18,26 : cy0x<0-7> + 5,13, 6,14,21,29,22,30 : cy1x<0-7> + 3,11, 0, 8,19,27,16,24 : cy2x<0-7> + 7,15, 4,12,23,31,20,28 : cy3x<0-7> + 35,43,32,40,51,59,48,56 : cy4x<0-7> + 39,47,36,44,55,63,52,60 : cy5x<0-7> + 33,41,34,42,49,57,50,58 : cy6x<0-7> + 37,45,38,46,53,61,54,62 : cy7x<0-7> + + 64-71 : j0Ax<0-3>, j0By<0-3> + 72-79 : j1Ax<0-3>, j1By<0-3> + + 0-79 : r<0-79> + + 100-101 : count, stop + + //102-112 ~ readAs, readBs, writeS + + + +--:-:-:-:1 MOV count, RZ; +--:-:-:-:1 MOV32I stop, %d; +//--:-:-:-:1 MOV writeS, RZ; +//--:-:-:-:1 MOV readAs, RZ; +//--:-:-:-:1 MOV readBs, RZ; 
+ + + return join '', map "--:-:-:-:1 MOV r$_, RZ;\n", 0..63; + + + + return join '', map "--:-:-:-:1 MOV32I r$_, 0x00010001;\n", 64..79; + + +LOOP: + +--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; +--:-:-:-:1 IADD count, count, 1; + + + my $out; + + my @swirl1 = ([0,0],[0,4],[4,4],[4,0]); + my @swirl2 = ([0,0],[1,0],[1,1],[0,1]); + my @swirl3 = ([0,2],[2,2],[2,0],[0,0]); + + my @cOrder; + foreach my $s1 (@swirl1) + { + foreach my $s2 (@swirl2) + { + foreach my $s3 (@swirl3) + { + push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]]; + } + } + } + + foreach my $j (0..7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + + my %%insert; + + #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6; + + $insert{c62} = + "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . + "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . + "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . + "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . + "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + my $ins = $insert{"c$c"} || ''; + my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins || + my $yield = $c == 32 ? 'Y' : '-'; + my $wait = '--'; #$c ? '--' : '01'; + + my $xReg = $x >> 1; + my $yReg = $y >> 1; + my $xPart = $x & 1 ? '.H1' : ''; + my $yPart = $y & 1 ? 
'.H1' : ''; + + $out .= sprintf "$wait:-:-:$yield:$stall XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x, $odd,$xReg,$xPart, $odd,$yReg,$yPart, $y,$x, $ins; + } + } + return $out; + + +--:-:-:Y:5 @P0 BRA LOOP; +--:-:-:-:5 EXIT; +END_SASS + + close $fh; +} + +__END__ + + my %%insert = ( + c0 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n", + c2 => "--:-:-:-:1 LDS.U.128 j${nOdd}By00, [readBs+0x10];\n", + c4 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n", + c6 => "--:-:1:-:1 LDS.U.128 j${nOdd}By64, [readBs+0x10];\n", + ); \ No newline at end of file diff --git a/Assembler/PascalAs/microbench/xmad.pl b/Assembler/PascalAs/microbench/xmad.pl new file mode 100755 index 0000000..6aadb89 --- /dev/null +++ b/Assembler/PascalAs/microbench/xmad.pl @@ -0,0 +1,12 @@ +#!/usr/bin/perl +use strict; + +print `maxas.pl -i xmad2.sass microbench.cubin`; + +exit if $?; + +print `./microbench i 1 128`; + + +__END__ + diff --git a/Assembler/PascalAs/microbench/xmad2.sass b/Assembler/PascalAs/microbench/xmad2.sass new file mode 100644 index 0000000..f0ce936 --- /dev/null +++ b/Assembler/PascalAs/microbench/xmad2.sass @@ -0,0 +1,144 @@ +# Kernel: microbench +# InsCnt: 18 +# RegCnt: 5 +# SharedSize: 4096 +# BarCnt: 1 +# Params(3): +# ord:addr:size:align +# 0:0x140:8:0 +# 1:0x148:8:0 +# 2:0x150:8:0 +# +# Instructions: + + + blockDimX : c[0x0][0x8] + blockDimY : c[0x0][0xc] + blockDimZ : c[0x0][0x10] + gridDimX : c[0x0][0x14] + gridDimY : c[0x0][0x18] + gridDimZ : c[0x0][0x1c] + + param_out[0] : c[0x0][0x140] + param_out[1] : c[0x0][0x144] + param_clocks[0] : c[0x0][0x148] + param_clocks[1] : c[0x0][0x14c] + param_in[0] : c[0x0][0x150] + param_in[1] : c[0x0][0x154] + + + + + 0-1 : out<0-1> + 2-3 : clocks<0-1> + 4-15 : result, result2, tid, bid, blockDim, clock1, clock2, scale, s + 16-24 : a, b, c, x + + + +// Load in our params +--:-:-:-:1 MOV out0, param_out[0]; +--:-:-:-:1 MOV out1, param_out[1]; +--:-:-:-:1 MOV clocks0, param_clocks[0]; +--:-:-:-:1 MOV 
clocks1, param_clocks[1]; +//--:-:-:-:1 MOV in, c[0x0][0x148]; +--:-:-:-:1 MOV blockDim, blockDimX; + +--:-:-:-:1 PSETP.AND.AND P0, PT, !PT, PT, PT; + +--:-:-:-:6 MOV32I result, 0xffffffff; +--:-:-:-:6 MOV32I result2, 0x0; +--:-:-:-:1 MOV32I a, 1; +--:-:-:-:1 MOV32I b, 1; +--:-:-:-:6 MOV32I c, 0x0; + +// (127 - scale) << 23 +//--:-:-:-:6 MOV32I scale, 28; +//--:-:-:-:6 IADD scale, -scale, 127; +//--:-:-:-:6 SHL scale, scale, 23; + + +//--:-:-:-:6 MOV32I c, 0x4f765432; + +//--:-:1:-:2 LDG.CI.128 a, [in]; + +//01:-:-:-:6 VMAD.S16.S16 result, a, b, c; + +//--:-:-:-:6 MOV result, a; + +// a >> 16 | (b & 0xffff0000) + +//--:-:-:-:6 SHR.U32 result, a, 16; +//--:-:-:-:6 LOP3.LUT result, result, b, c, 0xf8; + +//--:-:-:-:6 I2I.S32.S16 result, a.H1; + +//--:-:-:Y:d IADD result.CC, a, -c; +//--:-:-:Y:2 IADD.X result2, b, -RZ; + +//--:-:-:-:6 SHR result, a, 1; + +//--:-:-:-:6 BFI result, b, 0x1010, a; + +--:-:-:-:1 CS2R clock1, SR_CLOCKLO; + +//--:-:-:-:6 XMAD.S16.S16 c, a, b, RZ; +//--:-:-:-:6 ISET.LT.AND s, c, RZ, PT; +//--:-:-:-:6 IADD result.CC, c, result; +//--:-:-:-:6 IADD.X result2, s, result2; + +//--:-:-:-:6 XMAD.S16.S16 result.CC, a, b, result; +//--:-:-:-:6 IADD.X result2, result2, RZ; + +//--:-:-:-:6 SHF.R.S64 result, result, 1, result2; +//--:-:-:-:6 MOV32I result2, 0; + +--:-:-:-:f LOP.AND.NZ P0, RZ, result, 1; + +--:-:-:-:6 @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result; + +//--:-:1:-:d I2F.F32.S32 result2, a; +//01:-:-:-:6 FMUL result2, result2, scale; +//01:-:2:-:d F2I.S32.F32 result, result2; + +02:-:-:-:6 CS2R clock2, SR_CLOCKLO; + +//F2I = "^$pred?F2I$ftz$x2x$round $r0, $cr20;" +//I2F = "^$pred?I2F$x2x$rnd $r0, $cr20;" +//x2x = "\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)" +//rnd = "(?:\.(?RN|RM|RP|RZ))?" +//round = "(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?" +//r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" +//r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" 
+ + +//--:-:-:-:1 XMAD.MRG x, a, b.H1, RZ; +//--:-:-:-:6 XMAD result, a.H1, b.H1, c; +//--:-:-:-:1 XMAD.PSL.CBCC result, a.H1, x.H1, result; + +// Get the first clock value + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:2 S2R bid, SR_CTAID.X; + + + +// Take the difference of clocks +--:-:-:-:1 IADD clock1, clock2, -clock1; + +// Setup our output addresses +// Stall your pipeline dependencies properly +03:-:-:-:1 XMAD tid, blockDim, bid, tid; +--:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; +--:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; +--:-:-:Y:6 SHL tid, tid, 0x2; + +--:-:-:-:1 IADD clocks, clocks, tid; +--:-:-:-:1 IADD out, out, tid; + +// Output the results. +// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values +--:-:-:-:1 STG.E [clocks], result2; +--:-:-:-:1 STG.E [out], result; +--:-:-:-:5 EXIT; + diff --git a/Assembler/PascalAs/pm_to_blib b/Assembler/PascalAs/pm_to_blib new file mode 100644 index 0000000..e69de29 diff --git a/Assembler/PascalAs/sgemm/batched_gemm.xlsx b/Assembler/PascalAs/sgemm/batched_gemm.xlsx new file mode 100644 index 0000000..c88f0a7 Binary files /dev/null and b/Assembler/PascalAs/sgemm/batched_gemm.xlsx differ diff --git a/Assembler/PascalAs/sgemm/cublas_sgemm.ptx b/Assembler/PascalAs/sgemm/cublas_sgemm.ptx new file mode 100644 index 0000000..8edec86 --- /dev/null +++ b/Assembler/PascalAs/sgemm/cublas_sgemm.ptx @@ -0,0 +1,65 @@ +.version 4.1 +.target sm_50 +.address_size 64 + +// ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx + +// You can use maxas to insert cublas_device.lib code into a cubin built from this ptx: + +// From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib + +// cuobjdump -lelf cublas_device.lib | find "sm_50" + +// cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib + +// maxas -l maxwell_sgemm.asm.sm_50.cubin + +// maxas -e -k maxwell_sgemm_128x128_nt 
maxwell_sgemm_128x128_nt.sass +// maxas -e -k maxwell_sgemm_128x64_nt maxwell_sgemm_128x64_nt.sass + +// maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin +// maxas -i maxwell_sgemm_128x64_nt.sass cublas_sgemm.cubin + +// The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas. + +.visible .entry maxwell_sgemm_128x128_nt( + .param .u64 .ptr.global.align 8 param_A, + .param .u64 .ptr.global.align 8 param_B, + .param .u64 .ptr.global.align 8 param_C, + .param .s32 param_lda, + .param .s32 param_ldb, + .param .s32 param_ldc, + .param .s32 param_k, + .param .u64 .ptr.global.align 8 param_Alpha, + .param .u64 .ptr.global.align 8 param_Beta, + .param .s32 param_alpha, + .param .s32 param_beta, + .param .s32 param_flag +) +.reqntid 256 +{ + .shared .align 16 .b8 share[16384]; + + ret; +} + +.visible .entry maxwell_sgemm_128x64_nt( + .param .u64 .ptr.global.align 8 param_A, + .param .u64 .ptr.global.align 8 param_B, + .param .u64 .ptr.global.align 8 param_C, + .param .s32 param_lda, + .param .s32 param_ldb, + .param .s32 param_ldc, + .param .s32 param_k, + .param .u64 .ptr.global.align 8 param_Alpha, + .param .u64 .ptr.global.align 8 param_Beta, + .param .s32 param_alpha, + .param .s32 param_beta, + .param .s32 param_flag +) +.reqntid 128 +{ + .shared .align 16 .b8 share[12288]; + + ret; +} diff --git a/Assembler/PascalAs/sgemm/new.cubin b/Assembler/PascalAs/sgemm/new.cubin new file mode 100644 index 0000000..6a1572b Binary files /dev/null and b/Assembler/PascalAs/sgemm/new.cubin differ diff --git a/Assembler/PascalAs/sgemm/sgemm.cpp b/Assembler/PascalAs/sgemm/sgemm.cpp new file mode 100644 index 0000000..f2127d8 --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm.cpp @@ -0,0 +1,480 @@ +// sgemm.cpp : Defines the entry point for the console application. 
+// + +#include +#include +#include +#include +#include +#include + +CUcontext hContext = 0; +cublasHandle_t hCublas = 0; + +float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat = 1, int printVars = 0); +float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat); +void gflops(const char* ident, int N, float ms, int repeat); +void test(float* C, float* T, int N, size_t size); + +#define REPEAT_BLOCK 2000 + +#define CUDA_CHECK( fn ) do { \ + CUresult status = (fn); \ + if ( CUDA_SUCCESS != status ) { \ + const char* errstr; \ + cuGetErrorString(status, &errstr); \ + printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \ + if (hCublas) cublasDestroy(hCublas); \ + if (hContext) cuCtxDestroy(hContext); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUBLAS_CHECK( fn ) do { \ + cublasStatus_t status = (fn); \ + if ( CUBLAS_STATUS_SUCCESS != status ) { \ + printf("Cublas Failure (line %d of file %s):\n\t%s returned %d\n", __LINE__, __FILE__, #fn, status); \ + if (hCublas) cublasDestroy(hCublas); \ + if (hContext) cuCtxDestroy(hContext); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +int main(int argc, char* argv[]) +{ + char deviceName[32]; + int count, ordinal, major, minor; + CUdevice hDevice; + CUevent hStart, hStop; + CUdeviceptr devA, devB, devC, devT, otherDevA, otherDevB; + + // Initialize the Driver API and find a device + CUDA_CHECK( cuInit(0) ); + CUDA_CHECK( cuDeviceGetCount(&count) ); + for (ordinal = 0; ordinal < count; ordinal++) + { + CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); + CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); + CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, 
hDevice) ); + CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); + if (major > 5 || (major == 5 && minor >= 2)) // compare (major, minor) as a pair so 6.0/6.1 devices are not rejected for their low minor number + { + //printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); + break; + } + } + if (ordinal == count) // loop finished without a break: no device met the requirement + { + printf("No compute 5.2 or newer device found, exiting.\n"); + exit(EXIT_FAILURE); + } + + // First command line arg is the size of N divided by 128 (values outside 1..64 fall back to 64) + int thread128 = 64; + if (argc > 1) + thread128 = atoi(argv[1]); + if (thread128 > 64 || thread128 < 1) + thread128 = 64; + + // Second command line arg is the repeat count for benchmarking (values outside 1..10000 fall back to 1) + int repeat = 1; + if (argc > 2) + repeat = atoi(argv[2]); + if (repeat > 10000 || repeat < 1) + repeat = 1; + + // Third command line arg is the normalized float size (any unsupported value falls back to float) + CUarray_format format = CU_AD_FORMAT_FLOAT; + if (argc > 3) + format = (CUarray_format)atoi(argv[3]); + if (format != CU_AD_FORMAT_FLOAT && format != CU_AD_FORMAT_UNSIGNED_INT16 && format != CU_AD_FORMAT_UNSIGNED_INT8) + format = CU_AD_FORMAT_FLOAT; + + // Fourth command line arg is for printf debugging (values outside 1..100 disable it) + int printVars = 0; + if (argc > 4) + printVars = atoi(argv[4]); + if (printVars > 100 || printVars < 1) + printVars = 0; + + int N = thread128 * 128; + float alpha = 1, beta = 0, ms = 1; + size_t sizeOther = N * N; // element count here; scaled to a byte count per format below + size_t sizeFloat = sizeOther * 4; // byte size of an N x N float matrix + + float* A = (float*)malloc(sizeFloat); + float* B = (float*)malloc(sizeFloat); + float* C = (float*)malloc(sizeFloat); + float* T = (float*)malloc(sizeFloat); + float *otherA, *otherB; + + //int counter = 0; + //srand((unsigned int)time(0)); + for(int i = 0; i < N * N; i++) // + { + //A[i] = (float)rand() / (float)RAND_MAX; + //B[i] = (float)rand() / (float)RAND_MAX; + A[i] = B[i] = 1.0f; // * (i & 3) + 1.0f; + //A[i] = 1.0f; + //B[i * N + counter++] = 1.0f; // identity matrix + } + + if (format == CU_AD_FORMAT_FLOAT) + { + sizeOther *= 4; + otherA = A; + otherB = B; + } + else if (format == CU_AD_FORMAT_UNSIGNED_INT16) + { + sizeOther *= 2; + unsigned short* othera = (unsigned 
short*)malloc(sizeOther); + unsigned short* otherb = (unsigned short*)malloc(sizeOther); + for(int i = 0; i < N * N; i++) + othera[i] = otherb[i] = 65535; + + otherA = reinterpret_cast<float*>(othera); // otherA/otherB are declared float*; here they only carry the address of the 16-bit buffers + otherB = reinterpret_cast<float*>(otherb); + } + else // (format == CU_AD_FORMAT_UNSIGNED_INT8) + { + otherA = (float*)malloc(sizeOther); + otherB = (float*)malloc(sizeOther); + memset(otherA, 255, sizeOther); + memset(otherB, 255, sizeOther); + } + + CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); + //CUBLAS_CHECK( cublasCreate(&hCublas) ); + + CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT + CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); + + CUDA_CHECK( cuMemAlloc(&devA, sizeFloat) ); + CUDA_CHECK( cuMemAlloc(&devB, sizeFloat) ); + CUDA_CHECK( cuMemAlloc(&devC, sizeFloat) ); + CUDA_CHECK( cuMemAlloc(&devT, sizeFloat) ); + + CUDA_CHECK( cuMemcpyHtoD(devA, A, sizeFloat) ); + CUDA_CHECK( cuMemcpyHtoD(devB, B, sizeFloat) ); + CUDA_CHECK( cuMemsetD8(devC, 0, sizeFloat) ); + CUDA_CHECK( cuMemsetD8(devT, 0, sizeFloat) ); + + if (format == CU_AD_FORMAT_FLOAT) + { + otherDevA = devA; + otherDevB = devB; + } + else + { + CUDA_CHECK( cuMemAlloc(&otherDevA, sizeOther) ); + CUDA_CHECK( cuMemAlloc(&otherDevB, sizeOther) ); + CUDA_CHECK( cuMemcpyHtoD(otherDevA, otherA, sizeOther) ); + CUDA_CHECK( cuMemcpyHtoD(otherDevB, otherB, sizeOther) ); + } + + // Warm up the clock (unless under nsight) + //if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER + // for (int i = 0; i < 3; i++) + // CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast(devA), N, reinterpret_cast(devB), N, &beta, reinterpret_cast(devT), N) ); + + // Launch our kernel + ms = assemblySgemm("sgemm_kernel_64", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars); + gflops("Max64 ", N, ms, repeat); + + ms = assemblySgemm("sgemm_kernel_128", format, sizeOther, devC, otherDevA, otherDevB, 
N, hStart, hStop, repeat, printVars); + gflops("Max128", N, ms, repeat); + + //ms = cublasSgemm("maxwell_sgemm_128x64_nt", devT, devA, devB, N, hStart, hStop, repeat); + //gflops("Cub64 ", N, ms, repeat); + + //ms = cublasSgemm("maxwell_sgemm_128x128_nt", devT, devA, devB, N, hStart, hStop, repeat); + //gflops("Cub128", N, ms, repeat); + + // Run cublas again for the same repeat count for comparison + //CUDA_CHECK( cuEventRecord(hStart, NULL) ); + //for (int i = 0; i < repeat; i++) + // CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast(devA), N, reinterpret_cast(devB), N, &beta, reinterpret_cast(devT), N) ); + //CUDA_CHECK( cuEventRecord(hStop, NULL) ); + //CUDA_CHECK( cuEventSynchronize(hStop) ); + //CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); + //gflops("Cublas", N, ms, repeat); + + // Get back our results from each kernel + CUDA_CHECK( cuMemcpyDtoH(C, devC, sizeFloat) ); + CUDA_CHECK( cuMemcpyDtoH(T, devT, sizeFloat) ); + + // Cleanup and shutdown of cuda + CUDA_CHECK( cuMemFree(devA) ); + CUDA_CHECK( cuMemFree(devB) ); + CUDA_CHECK( cuMemFree(devC) ); + CUDA_CHECK( cuMemFree(devT) ); + if (format != CU_AD_FORMAT_FLOAT) + { + CUDA_CHECK( cuMemFree(otherDevA) ); + CUDA_CHECK( cuMemFree(otherDevB) ); + } + + CUDA_CHECK( cuEventDestroy(hStart) ); + CUDA_CHECK( cuEventDestroy(hStop) ); + + //CUBLAS_CHECK( cublasDestroy(hCublas) ); + //hCublas = 0; + CUDA_CHECK( cuCtxDestroy(hContext) ); + hContext = 0; + + // compare C and T for accuracy + test(C, T, N, sizeFloat); + + // And free up host memory + free(A); free(B); free(C); free(T); + + if (format != CU_AD_FORMAT_FLOAT) + { + free(otherA); + free(otherB); + } + + return 0; +} + +// Our kernel wrapper function +float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat, int printVars) +{ + // Configure our x and y grid dimensions (assume nice 
square matrixes). + // Each block gets 128 tracks from A and 128 tracks from B. + // Each of the 256 threads calculates 64 elements of that 128x128 sub matrix of C. + // See Figure 2 here to get the gist of things (we use a different mapping to maximize LDS.128 usage): + // http://icl.cs.utk.edu/projectsfiles/magma/pubs/fermi_gemm.pdf + + int threads, width; + if (strcmp(kernel, "sgemm_kernel_64") == 0) + { + threads = 64; + width = 64; + } + else + { + threads = 256; + width = 128; + } + + int gridDimXY = N / width + (N % width != 0); + int blocks = gridDimXY * gridDimXY; + + // Setup out debug printf output buffer + CUdeviceptr devD = NULL; + int* D = NULL; + int sizeD = 0; + + if (printVars) + { + sizeD = blocks * threads * printVars * sizeof(int); + D = (int*)malloc(sizeD); + + CUDA_CHECK( cuMemAlloc(&devD, sizeD) ); + CUDA_CHECK( cuMemsetD8(devD, 0, sizeD) ); + } + + // Load the cubin + CUmodule hModule; + CUDA_CHECK( cuModuleLoad(&hModule, "sgemm.cubin") ); + + // Load the textures + CUtexref texA, texB; + CUDA_CHECK( cuModuleGetTexRef(&texA, hModule, "texA") ); + CUDA_CHECK( cuModuleGetTexRef(&texB, hModule, "texB") ); + + // Configure the textures + CUDA_CHECK( cuTexRefSetFormat(texA, format, 4) ); + CUDA_CHECK( cuTexRefSetFormat(texB, format, 4) ); + + CUDA_CHECK( cuTexRefSetAddress(NULL, texA, devA, size) ); + CUDA_CHECK( cuTexRefSetAddress(NULL, texB, devB, size) ); + + // Load the kernel function + CUfunction hKernel; + CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) ); + + // Setup the params + float alpha = 1.0f; + void* params[] = { &devC, &N, &N, &N, &N, &N, &N, &alpha, &devD }; + + float totalTime = 0; + // Launch the kernel repeat times.. but break it up into pieces so as not to lock things up. + while (repeat > 0) + { + float ms; + int r = repeat > REPEAT_BLOCK ? 
REPEAT_BLOCK : repeat; + CUDA_CHECK( cuEventRecord( hStart, NULL ) ); + + for (int i = 0; i < r; i++) + CUDA_CHECK( cuLaunchKernel(hKernel, gridDimXY, gridDimXY, 1, threads, 1, 1, 0, 0, params, 0) ); + + CUDA_CHECK( cuEventRecord( hStop, NULL ) ); + CUDA_CHECK( cuEventSynchronize( hStop ) ); + CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) ); + totalTime += ms; + repeat -= r; + } + + + CUDA_CHECK( cuModuleUnload(hModule) ); + + // And here we print out the debug info if requested: + if (printVars) + { + CUDA_CHECK( cuMemcpyDtoH(D, devD, sizeD) ); + CUDA_CHECK( cuMemFree(devD) ); + int *iD = D; // three typed views of the same debug buffer, advanced together per thread + float *fD = reinterpret_cast<float*>(D); + unsigned int *uD = reinterpret_cast<unsigned int*>(D); + + for (int by = 0; by < gridDimXY; by++) + { + for (int bx = 0; bx < gridDimXY; bx++) + { + unsigned int clock = 0xffffffff, sm = 0; // clock keeps the minimum uD[1] seen in this block; sm keeps the last thread's uD[0] + + for (int tid = 0; tid < threads; tid++) + { + //printf("by: %3d, bx: %3d, tid:%3d, rA:%5d, rB:%5d, wr:%5d, rd:%5d, cx:%5d, cy:%5d, ci:%5d, c:%.2f\n", + //printf("by: %3d, bx: %3d, tid:%3d, t0:%5d, end:%5d, k:%5d, tid2:%5d, tid15:%5d, ldx:%5d, t2:%5d, t4:%5d\n", + // by, bx, tid, iD[0], iD[1], iD[2], iD[3], iD[4], iD[5], iD[6], iD[7] + //); + if (uD[1] < clock) clock = uD[1]; + sm = uD[0]; + + iD += printVars; + fD += printVars; + uD += printVars; + } + printf("%02d %08u %d %d\n", sm, clock, by, bx); + } + } + free(D); + } + + return totalTime; +} + +typedef struct dPointer +{ + CUdeviceptr lo; + CUdeviceptr hi; +} dPointer; + +float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat) +{ + int threads, gridX, gridY; + if (strcmp(kernel, "maxwell_sgemm_128x64_nt") == 0) + { + threads = 128; + gridX = N / 128 + (N % 128 != 0); + gridY = N / 64 + (N % 64 != 0); + } + else + { + threads = 256; + gridX = gridY = N / 128 + (N % 128 != 0); + } + int blocks = gridX * gridY; + + // Load the cubin + // See cublas_sgemm.ptx for info on how to build this. 
+ CUmodule hModule; + CUDA_CHECK( cuModuleLoad(&hModule, "cublas_sgemm.cubin") ); + + // Load the kernel function + CUfunction hKernel; + CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) ); + + // Setup the params + // I should probably be working in 64 bits... + dPointer dA = { devA, 0 }; + dPointer dB = { devB, 0 }; + dPointer dC = { devC, 0 }; + + int flag = 0; + float alpha = 1.0; + float beta = 0.0; + + void* params[] = { &dA, &dB, &dC, &N, &N, &N, &N, &dA, &dA, &alpha, &beta, &flag }; + + float totalTime = 0; + // Launch the kernel repeat times.. but break it up into pieces so as not to lock things up. + while (repeat > 0) + { + float ms; + int r = repeat > REPEAT_BLOCK ? REPEAT_BLOCK : repeat; + CUDA_CHECK( cuEventRecord( hStart, NULL ) ); + + for (int i = 0; i < r; i++) + CUDA_CHECK( cuLaunchKernel(hKernel, gridX, gridY, 1, threads, 1, 1, 0, 0, params, 0) ); + + CUDA_CHECK( cuEventRecord( hStop, NULL ) ); + CUDA_CHECK( cuEventSynchronize( hStop ) ); + CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) ); + totalTime += ms; + repeat -= r; + } + + + CUDA_CHECK( cuModuleUnload(hModule) ); + + return totalTime; +} + +void gflops(const char* ident, int N, float ms, int repeat) +{ + // Standard sgemm flops formula + ms /= repeat; + printf("%s GFLOPS: %.2f (size: %d, iterations: %d)\n", ident, ((double)N * N * N * 2.0 + N * N) / (ms * 1000000.0), N, repeat); +} + +void test(float* C, float* T, int N, size_t size) +{ + // Compare our implementation with the cublas result + int errors = memcmp(C, T, size); + if (errors) + { + if (N <= 512) // This gets too big and slow for large N + { + errors = 0; + FILE* file; + if (fopen_s(&file, "data.txt", "w") == 0) + { + for (int y = 0; y < N; ++y) + { + for (int x = 0; x < N; ++x) + { + float c = C[x*N + y]; + float t = T[x*N + y]; + if (c != t) + { + errors++; + fprintf(file, "%.8f!%.8f\t", c , t); + //fprintf(file, "%.0f!", c); + //fprintf(file, "!"); + } + else + { + //fprintf(file, "%.0f=%.0f\t", c , t); + 
//fprintf(file, "%.0f=", c); + fprintf(file, "="); + } + } + fprintf(file, "\n"); + } + fclose(file); + printf("%d errors\n", errors); + } + else + { printf("Cannot open data.txt for writing\n"); } + } + else + { printf("%d errors\n", errors); } + } + else + { printf("%d errors\n", errors); } +} \ No newline at end of file diff --git a/Assembler/PascalAs/sgemm/sgemm.cu b/Assembler/PascalAs/sgemm/sgemm.cu new file mode 100644 index 0000000..ce8b2a6 --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm.cu @@ -0,0 +1,105 @@ + +// Note this file isn't configured to automatically compile. +// Here's how: + +// If you want to look at the ptx first: +// nvcc -arch sm_50 -m 32 -ptx sgemm.cu + +// Manually compile your kernel to a cubin. +// You should only have to do this once, unless you change params or shared size or globals: +// nvcc -arch sm_50 -m 32 -cubin sgemm.cu + +// If tweaking a kernel or writing a new one based on this shell code you would then do this: +// maxas.pl -e kernel.cubin kernel.sass + +// I've already included a modified kernel (sgemm.sass) so the next step is.. 
+ +// Splice the manually assembled code back into the cubin: +// maxas.pl -i sgemm.sass sgemm.cubin + +#include +#include +#include +#include + +typedef texture floatTex; + +floatTex texA(0, cudaFilterModePoint, cudaAddressModeBorder); +floatTex texB(0, cudaFilterModePoint, cudaAddressModeBorder); + +// Use extern C so C++ doesn't mangle our kernel name +extern "C" +// This kernel requires 256x1x1 threads per block +__global__ void __launch_bounds__(256) sgemm_kernel_128( + float *C, + const int m, const int n, const int k, + const int lda, const int ldb, const int ldc, + float alpha, int *D) +{ + // Declare any shared memory your kernel requires + // Or you could just pass the amount in as a param to cuLaunchKernel + __shared__ float4 share[1024]; + + int tid = threadIdx.x; + + // If you use indirect texture references, they will be passed as params at the end of the param list + // So set that up here to make sure they're available in your kernel + floatTex tex = tid > 127 ? texB : texA; + + // Make use of shared and your textures so it doesn't get optimized away + share[tid] = tex1Dfetch(tex, tid); + + __syncthreads(); + + // output something so your setup isn't optimized away. + C[tid] = share[255-tid].x; +} + +extern "C" +__global__ void __launch_bounds__(64) sgemm_kernel_64( + float *C, + const int m, const int n, const int k, + const int lda, const int ldb, const int ldc, + float alpha, int *D) +{ + __shared__ float4 share[512]; + + int tid = threadIdx.x; + + floatTex tex = tid > 127 ? texB : texA; + + share[tid] = tex1Dfetch(tex, tid); + + __syncthreads(); + + C[tid] = share[255-tid].x; +} + +// A note about using the Cuda Runtime. +// If that's your preference over the driver API then here's what you'd do: + +// In your project properties in the Cuda C/C++ panel: +// -Set the "Keep Processed Files" (-keep) option +// -Add a -v manually to the command line +// If compiling on command line just add -keep -v options to nvcc. 
+// Rebuild your solution and look in the log for these lines that follow the ptxas step: + +// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda +// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" +// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" + +// You just need to manually run these 3 commands (or add them to a build script) +// after you've modified the cubin generated from the preceding ptxas command. +// That will give you a new .cu.obj file which will automatically be linked in for you next time you +// build your project (or you could manually run the linker step as well). + +// Having done that you can call your kernel normally using the <<< >>> syntax. +// Debugging will have to be with the sass syntax but that's what you'll want to see anyway. +// With fatbin you can also keep non-maxwell optimized versions of your code. + + +// I just discovered this also works as a shortcut to the above: +// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu + +// The cu kernel definitions above need to have empty bodies. +// And, the cu file must be compiled to a lib separately before linking. 
\ No newline at end of file diff --git a/Assembler/PascalAs/sgemm/sgemm.cubin b/Assembler/PascalAs/sgemm/sgemm.cubin new file mode 100644 index 0000000..946c7d7 Binary files /dev/null and b/Assembler/PascalAs/sgemm/sgemm.cubin differ diff --git a/Assembler/PascalAs/sgemm/sgemm.pl b/Assembler/PascalAs/sgemm/sgemm.pl new file mode 100644 index 0000000..9b1661b --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm.pl @@ -0,0 +1,102 @@ +#!/usr/bin/perl +use strict; # Driver script: re-runs maxas.pl on the sgemm .sass sources when they are newer than their *_pre_* files, then runs the benchmark binary. +use warnings; +my $CU_AD_FORMAT_UNSIGNED_INT8 = 0x01; +my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02; +my $CU_AD_FORMAT_FLOAT = 0x20; + +if (!-f 'sgemm_pre_128.sass' || (stat 'sgemm128.sass')[9] > (stat 'sgemm_pre_128.sass')[9]) # rebuild when the pre file is missing or has an older mtime than its source +{ + print `maxas.pl -p sgemm128.sass sgemm_pre_128.sass`; + exit 1 if $?; # a failed step must not report success to the caller + print `maxas.pl -i sgemm128.sass sgemm.cubin`; + exit 1 if $?; + print `maxas.pl -e -k sgemm_kernel_128 sgemm.cubin sgemm_final_128.sass`; +} +if (!-f 'sgemm_pre_64.sass' || (stat 'sgemm64.sass')[9] > (stat 'sgemm_pre_64.sass')[9]) # same rule for the 64-thread kernel +{ + print `maxas.pl -p sgemm64.sass sgemm_pre_64.sass`; + exit 1 if $?; + print `maxas.pl -i sgemm64.sass sgemm.cubin`; + exit 1 if $?; + print `maxas.pl -e -k sgemm_kernel_64 sgemm.cubin sgemm_final_64.sass`; +} + +#print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2); + +`Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`; + +print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`; +exit; # everything below is skipped while this exit is in place + +my %data; +foreach my $thread128 (4 .. 
64) +{ + my $N = $thread128 * 128; + + my $iterations = int(20 * (64 * 128)**3 / $N**3); # scale the repeat count inversely with N**3, capped at 10000 below + $iterations = 10000 if $iterations > 10000; + + print "$N $iterations\n"; + + my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`; + + foreach my $bench (split "\n", $data) + { + if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /) + { + push @{$data{$N}}, $2; + print "$1 $2\n"; + } + } +} +print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n"; + +foreach my $N (sort { $a <=> $b } keys %data) +{ + print join("\t", $N, @{$data{$N}}), "\n"; # lead each row with its size so the columns line up with the header +} + + +#print $data; + +__END__ + + +64 * 128 * 16 * 1.620 * .931 / 520 + +Max64 GFLOPS: 1377.38 (size: 256, iterations: 2000) +Max128 GFLOPS: 973.70 (size: 256, iterations: 2000) +Cub64 GFLOPS: 1272.42 (size: 256, iterations: 2000) +Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000) + +my @data = grep /\S/, split "\n", $data; + +my $min; +my %smData; +my @sdata; +foreach (@data) +{ + next if /GFLOPS/; + + my ($sm, $clock, $by, $bx) = split /\s+/; + + $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm}; + + $min = $clock if !$min || $clock < $min; + + push @sdata, [$sm, $clock, $by, $bx]; +} + +foreach (@sdata) +{ + $_->[1] -= $smData{$_->[0]}; +} + +foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata) +{ + printf "%02d %8u by: %2d bx: %2d\n", @$_; + +} + + diff --git a/Assembler/PascalAs/sgemm/sgemm.sln b/Assembler/PascalAs/sgemm/sgemm.sln new file mode 100644 index 0000000..bcbee09 --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + 
{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32 + {D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32 + {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32 + {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Assembler/PascalAs/sgemm/sgemm.vcxproj b/Assembler/PascalAs/sgemm/sgemm.vcxproj new file mode 100644 index 0000000..6d28ced --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm.vcxproj @@ -0,0 +1,92 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + {D571379D-3653-43CB-BE83-A6C68D392A05} + Win32Proj + sgemm + + + + Application + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + true + + + false + + + + + + Level3 + Disabled + _CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) + + + Console + true + $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) + cuda.lib;cublas.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + _CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) + cuda.lib;cublas.lib;%(AdditionalDependencies) + + + + + + + + + + + + + \ No newline at end of file diff --git a/Assembler/PascalAs/sgemm/sgemm128.sass b/Assembler/PascalAs/sgemm/sgemm128.sass new file mode 100644 index 0000000..038d2f3 --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm128.sass @@ -0,0 +1,613 @@ +# Kernel: sgemm_kernel_128 +# +# SharedSize: 16384 +# Params(8): +# 0:0x140:4:4 param_C, +# 1:0x144:4:0 param_m, +# 2:0x148:4:0 param_n, +# 3:0x14c:4:0 param_k, +# 4:0x150:4:0 param_lda, +# 5:0x154:4:0 param_ldb, +# 
6:0x158:4:0 param_ldc +# 7:0x15c:4:0 param_alpha +# 8:0x160:4:4 param_D // for diagnostic printf output +# +# Globals: +# c[0x0][0x164]: texA (the value is 1) +# c[0x0][0x168]: texB (the value is 0) + + + + // Temporary registers to calculate the state registers. Reuse the C output registers. + // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts. + 0-63 ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy + + // Aliases for the C registers we use for initializing C (used as vectors) + 0-63 : cz<00-63> + + // The offset at which we store our zero value for initializing C. Reuse a register from the second blocking registers + 80 : zOffset + + // 64 C matrix output registers. + // Use special mapping to avoid register bank conflicts between these registers and the blocking registers. + 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> + 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> + 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> + 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> + 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> + + // Double buffered register blocking used in vector loads. + // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags + 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> + 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> + + // Registers to load A or B + 96-103 : loadX<0-7> + + // Key global state registers for main loop and some we reuse for outputting C. + // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of + // delayed bank conflicts between memory operations and ffmas. + // The array index bracket notation can be used to request a bank in a dynamically allocated range. 
+ 104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs + + // Registers to store the results back to global memory. Reuse any register not needed after the main loop. + // Statically allocate cs0-7 because they're vector registers. + 64-71 : cs<0-7> + + // dynamically allocated C output registers(~) + 72-103 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX + + + +// Note the absence of the loading of the stack pointer into R1. +// No idea why ptxas does that anyway when it's not used for register spilling. +// Such a waste of a perfectly good register. + +// Scheduler doesn't handle the dependency flags yet, +// so move these first instructions outside the block that's auto scheduled +//--:-:-:-:1 CS2R clock, SR_CLOCKLO; +//--:-:-:-:1 S2R smId, SR_VIRTID; +//--:-:-:-:1 S2R nSMs, SR_VIRTCFG; +--:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 +--:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 +--:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 + +// Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies +// Memory dependencies are left up to the author to deal with manually for now. + + +// First 128 threads load A to shared, 2nd 128 loads B to shared +// Note this technique is not possible in cuda or ptx as there's no way to +// efficiently specify a warp-uniform predicate for a memory op. +// Compile sgemm.cu and inspect the sass to see what I'm talking about. + +// blk = tid >= 128 ? by : bx; +// ldx = tid >= 128 ? ldb : lda; +// tex = tid >= 128 ? 
texB : texA; +01:-:-:Y:1 ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1 +06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 +--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150]; +--:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; +--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA +--:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB + +// Initialize the portion of shared we use to zero our C registers +// Give each warp its own address to write to. +// All threads write to the same address, but we don't care because only one needs to take. +// There is no bank conflict on writing to the same address, just indeterminacy in which thread will get its value stored. +--:-:-:-:1 LOP.AND zOffset, tid, -32; +--:-:-:-:1 STS.128 [zOffset + 4x<16*128>], RZ; + +// tid4 = (tid >> 5) & 3 +// tid31 = tid & 31 +// tid96 = tid & 96 +// tid128 = tid & 128 +--:-:-:-:1 BFE.U32 tid4, tid, 0x205; // 2 bits at position 5 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 LOP.AND tid128, tid, 128; + +// ldx4 = ldx * 4; +// ldx8 = ldx * 8; +--:-:-:-:1 SHR.U32 ldx, ldx4, 2; +--:-:-:-:1 IADD ldx8, ldx4, ldx4; + +// track0 = blk*128/4 + tid31 + (ldx * tid4) +--:-:-:-:1 ISCADD track0, blk, tid31, 5; +--:-:-:-:1 XMAD.LO track0, ldx, tid4, track0, xmad_t0; // XMAD.LO is a macro that is expanded out into the 3 XMADs +--:-:-:-:1 IADD track4, track0, ldx4; + +// writeS = tid31*4*4 + tid4*128*4 +// writeS += 4096 if tid >= 128 +--:-:-:-:1 SHL tid31_4, tid31, 4; +--:-:-:-:1 ISCADD writeS, tid4, tid31_4, 9; +--:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*128>; + +// int end = track0 + (k-8)*ldx; +--:-:-:-:1 MOV k, c[0x0][0x14c]; +--:-:-:-:1 IADD k, k, -8; +--:-:-:-:1 XMAD.LO end, k, ldx, track0, xmad_end; + +// readAs and readBs are carefully constructed to avoid any bank conflicts while loading from shared +// readAs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readAs, tid128, 4; +--:-:-:-:1 LOP.OR readAs, readAs, tid7; 
+--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readBs, tid, 0x70; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<8*128>, 4; + +// Preload the first 8 lines from texture memory +// Keep these instructions in this order (but allow others to interleave). +// Normally the scheduler tries to preserve source order by default, but this demonstrates how you enforce +// an ordering if you need to. +// Note: these are the 4 element vector load versions (last param: 0xf=vec4, 0x3=vec2, 0x1=single) + +--:-:1:-:1 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 +--:-:2:-:1 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2 + + + + +// Initialize C registeres to zero +// Using LDS.U.128 is a neat trick to save a few clock cyles +// (when you have enough warps to hide the latency.) + + return join '', map sprintf("--:-:3:-:1 LDS.U.128 cz%02d, [zOffset + 4x<16*128>];\n", $_ * 4), 0..15; + + +// These instuctions need to occur after the textures load so put them in a new block +// that starts with a dependency barrier wait. + + +01:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1 +02:-:-:-:1 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2 + +// Increment tracks after the loads are complete to avoid needing write-after-read dependencies +--:-:-:-:1 IADD track0, track0, ldx8; +--:-:-:-:1 IADD track4, track4, ldx8; + +// Wait for all threads to finish loading shared +04:-:-:-:5 BAR.SYNC 0; + + + +// The next store to shared goes to high area. +// Having 2 share buffers allows us to eliminate a bar.sync in the main loop. +// This way we don't have to wait for all threads to arrive before writing fresh data to shared. +// Other threads can continue reading from the last batch while the new data is being written. 
+--:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*128>; + +// Preload the first lines of A and B from shared +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1 + + +// The main loop +// While calculating the first line, load in the next line from shared. +// Shared memory stores enough to do this 8 times per loop. +// Also pull in the next block of memory from global and store it to shared. + +// Efficiency: +// ffma: 512 +// lds: 32 dual issued +// sts: 2 dual issued +// tex: 2 dual issued +// add: 2 +// xor: 3 +// setp: 1 +// bar: 1 dual issued +// bra: 1 dual issued +// Total: 518 (512/518 = 98.8% FFMA) + +// Memory Throughput Upper Bound: +// 2 * 4 * 4 bytes per thread per 518 clocks +// 128 threads per SM +// 16 SM's (GM204) +// 1640Mhz (boost overclock) +// .931 GiB/GB (1000^3 / 1024^3) +// 193 GiB/sec +// Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz) + +LOOP: + +// Loop end condition +--:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; + + + + # We eliminated bank conflicts with our C registers and the blocking registers, + # but there are still 16 bank conflicts between the blocking registers themselves. + # By ordering the FFMA's in a swirling zigzag pattern we can completely hide those conflicts + # behind register reuse. This pattern also maximizes that reuse (47%) and minimizes the bandwidth + # out of the register bank, thereby reducing power consumption and allowing the chip to + # stay at a higher sustained clock speed. One other constraint is that we want each successive + # instruction to pull its third operand from alternating banks. We space the swirl by 2 in the x + # direction to achieve this. This has the effect of making it easier to avoid delayed bank conflicts + # with the memory operations. 
Finally, for the very first ffma, don't choose one of the 16 bank conflicts + # as we have no way of hiding that conflict behind a reuse (cublas makes this mistake). + + # Alternating banks (1320 Hz, full speed) + my @swirl = ([2,0],[2,1],[0,1],[0,0]); + my @xVals = (0,1,64,65); + + # Repeating banks (1320Hz, 83 Gflops slower, but lower power draw probably because of increased stalls) + # Only explanation I can think of is increased delayed register bank conflicts with memory ops. + #my @swirl = ([0,1],[0,0],[1,0],[1,1]); + #my @xVals = (0,2,64,66); + + my @cOrder; + foreach my $y (0,2,64,66) + { + # apply the swirl + foreach my $x (@xVals) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + # apply the zigzag + @xVals = reverse @xVals; + } + + # This ordering (a simple zigzag) eliminates the bank conflicts but only achieves 39% reuse. + # It runs 20 GFlops slower since the register bank draws more power and the clock slows down to 1306 Hz. + # There may be more delayed bank conflicts with memory operations as the slowdown is 4 GFlops more than + # the reduced clock accounts for. + #my @cOrder2; + #my @xVals = (0..3,64..67); + #foreach my $y (0..3,64..67) + #{ + # @xVals = reverse @xVals; + # push @cOrder2, [$_, $y] foreach @xVals; + #} + #@cOrder = @cOrder2; + + my %insert = + ( + # Don't start the first TLD before 12 to let ISETP to write P0 + # These global reads and shared writes we put exactly in the middle of the LDS ops + # This is to not overwhelm the memory units with instructions (and because these were tested faster here). + # The 4 spacing seems to work best for vec4 instructions. + # It's odd that these two textures loads can drive 512 FFMA's all by themselves.. but 256 threads can load 8 128 F32 wide lines. + # So we only need 2 to get 8 lines from both matrices. 
+ + j0c31 => "--:-:2:-:1 \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2\n", + j0c33 => "--:-:3:-:1 \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3\n", + + j6c30 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2\n", + j6c34 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3\n", + + # We need one barrier in the main loop after writing shared memory. + # The barrier is needed even if this is our last loop because we need to protect the warp shuffle step. + # Note, BAR.SYNCs do not sync memory read access automatically, you still need to flag the barriers (writes are sync'd). + # After the BAR, swap our share buffer location. We don't need an additional barrier because of these swaps. + # Note, this doubles our shared memory usage but this kernel's occupancy is entirely bound by registers. + # LOP.XOR readAs needs to be 4 clocks prior to the LDS.U.128 for readAs (but push this as far down as possible) + j6c62 => + "01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<16*128>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<16*128>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<16*128>;\n", + + # Note having 2 IADDs slightly hits our FFMA performance (1/518 = .2%), but TLD doesn't take an offset. + # LDG.CI doesn't have this issue, but doesn't give you the nice features of texture loads: + # -Boundry Clamping: simplifies our matrix load logic so we don't need to worry about loading out of bounds + # -Normalized Floats: if we don't need full 32 bits of precision we could store our matrices using 16 or 8 bit values + j7c63 => + "--:-:-:-:1 \@P0 IADD track0, track0, ldx8;\n" . + "--:-:-:-:0 \@P0 IADD track4, track4, ldx8;\n" . + "--:-:-:Y:5 \@P0 BRA LOOP;\n", + ); + + my $out; + # We unroll our main loop 8 iterations. + # This gives us a loop instruction count of 556. Add the control instructions and that makes it 741 opcodes sized 8 bytes. 
+ # This is 5928 bytes, nicely fitting inside the 8kb instruction cache. Going to the next biggest size would be 12 lines. + # That would be 768 ffmas and not leaving enough room for the other instructions and control codes. + # So by staying inside the instruction cache size, we avoid hitting any instruction fetch latencies. + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + # Our rolling blocking registers stay one load ahead off the FFMA's (rs: read share) + my $rsOffset = ($j + 1) % 8; + # No need to load on last loop iteration + my $rsPred = $j == 7 ? '@P0' : ' '; + + # You can experiment here with different vector load sizes + my $vec = 128; + + if ($vec == 128) + { + # Roll up our LDS ops here to keep them easier to manage and tune + # Space at every other clock to maximize throughput. + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBy64, [readBs + 4x<%d*128 + 64>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; + } + elsif ($vec == 64) + { + # LDS.64 runs about 22 Gflops slower than LDS.128 (GM107). Not a huge difference since our latencies are so well hidden. + # I think LDS.128 is implemented internally as a pair of LDS.64 ops which could be another reason for the comparable performance. + # I think the big benefit with 128 is being able to issue all our LDS ops earlier, allowing more FFMA's prior to reading out the results. + # There could also be additional opportunity for delayed bank conflicts. 
+ $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c8"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c10"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c12"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c14"} = sprintf "--:-:1:-:1 %s LDS.U.64 j%dBy66, [readBs + 4x<%d*128 + 66>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; + } + else + { + # This one drops performance by over 200 Gflops. So you want to at least use LDS.64 if you can. + # We don't even have room to properly space these at half throuput. 
+ $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c1"} = sprintf "--:-:-:-:1 %s LDS j%dAx01, [readAs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c3"} = sprintf "--:-:-:-:1 %s LDS j%dAx03, [readAs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c5"} = sprintf "--:-:-:-:1 %s LDS j%dBy01, [readBs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:-:-:1 %s LDS j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c7"} = sprintf "--:-:-:-:1 %s LDS j%dBy03, [readBs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c8"} = sprintf "--:-:-:-:1 %s LDS j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c9"} = sprintf "--:-:-:-:1 %s LDS j%dAx65, [readAs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c10"} = sprintf "--:-:-:-:1 %s LDS j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c11"} = sprintf "--:-:-:-:1 %s LDS j%dAx67, [readAs + 4x<%d*128 + 67>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c12"} = sprintf "--:-:-:-:1 %s LDS j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c13"} = sprintf "--:-:-:-:1 %s LDS j%dBy65, [readBs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c14"} = sprintf "--:-:-:-:1 %s LDS j%dBy66, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c15"} = sprintf "--:-:1:-:1 %s LDS j%dBy67, [readBs + 4x<%d*128 + 67>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; + } + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + # Grab an instruction for insertion if one exists for this j and c combination + my $ins = $insert{"j${j}c$c"} || ''; + + # Scatter some yields in there to better balance the workload and reduce sync stalls + # Don't pair a yeild with the dual issued ffmas as that kills performance for some reason + ##### This no longer offers extra performance on GM204 as it did on GM107. It still does for the 64 thread version. Keeping since it doesn't hurt. #### + my $yield = $c == 32 ? 'Y' : '-'; + + # The first FFMA needs to wait on the prior loop's LDS.U.128 ops to finish (except if the barrier does the wait for us) + my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--',''); + + # Dual issue these ops + my $stall = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + # output our FFMA and also any inserted ops + $out .= sprintf "%s FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $comment, $ins; + } + } + return $out; + + + +// Main loop is done, time to write C to global memory. + + +// Remove the high bits if present from the last loop's xor. +// Also remove the 4096 added onto readBs. +// This gives us the x and y coordinates of the start of this thread's data in C. +--:-:-:-:1 LOP.AND readAs, readAs, 0xfff; +--:-:-:-:1 LOP.AND readBs, readBs, 0xfff; + +// Remap readAs and readBs onto writeCs so we can shuffle the output for coalesced global writes. +// readAs stays constant, readBs colapses down from stride 4 to 1 +// writeCs = (readBs / 4) * 128 + readAs; +--:-:-:-:1 ISCADD writeCs, readBs, readAs, 5; + +// Read out the C values from shared in a simple tid mapped pattern but +// offset by the position of this warp's colapsed data in shared. 
+ +// cx = tid31 | (tid128 >> 2); +--:-:-:-:1 SHR.U32 cx, tid128, 2; +--:-:-:-:1 LOP.OR cx, tid31, cx; + +// readCs = ((tid96 << 4) | cx) << 2; +--:-:-:-:1 SHL readCs, tid96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, cx; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx += bx*128; +--:-:-:-:1 ISCADD cx, bx, cx, 7; + +// cy = by*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 ISCADD cy00, by, cy00, 7; + +// C += (cy*ldc + cx) * 4; +--:-:-:-:1 MOV ldc, c[0x0][0x158]; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_ci; +--:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; + +// When writing in assembly, being able to 'printf' is sometimes easier than stepping through the debugger. +// Here's how it's done. Drop something like this in your code. Then modify the c code to accept this +// many params per thread to printf (see assemblySgemm function). + +//--:-:-:-:1 SHR.U32 smId, smId, 20; + +// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4 +// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4 +//--:-:-:-:1 MOV gridDimX, c[0x0][0x14]; +//--:-:-:-:1 MOV blckDimX, c[0x0][0x8]; +//--:-:-:-:1 XMAD.LO D, by, gridDimX, bx, xmad_D; +//--:-:-:-:1 XMAD.LO D, D, blckDimX, tid, xmad_D; +//--:-:-:-:1 ISCADD D, D, c[0x0][0x160], 3; // 4 bytes * 2 vars = 8 or shift 3 + +//--:-:-:-:1 STG.CS [D + 4x<0>], readAs; +//--:-:-:-:1 STG.CS [D + 4x<1>], readBs; +//--:-:-:-:1 STG.CS [D + 4x<2>], writeCs; +//--:-:-:-:1 STG.CS [D + 4x<3>], readCs; +//--:-:-:-:1 STG.CS [D + 4x<4>], cx; +//--:-:-:-:1 STG.CS [D + 4x<5>], cy00; +//--:-:-:-:1 STG.CS [D + 4x<6>], ci; +//--:-:-:-:1 STG.CS [D + 4x<7>], cx67y67; + +//--:-:-:-:1 STG.CS [D + 4x<0>], smId; +//--:-:-:-:1 STG.CS [D + 4x<1>], clock; + + +// Setup our matrix bounds checking vars and preds +// Bounds checking is what allows this code to work on matrix sizes not a multiple of 128 +--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m +--:-:-:-:1 IADD cx, cx, 64; +--:-:-:-:1 ISETP.LT.AND 
P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m + +--:-:-:-:1 IADD cy00, cy00, -1; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD cy12, cy00, 12; + +// Setup our C output addresses and increments. +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 SHL ldc8, ldc, 5; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; + +// Load the first set of the STORE_C subroutine params in the scheduled block. +# This is also a good time to apply alpha. +--:-:-:-:1 MOV alpha, c[0x0][0x15c]; + +--:-:-:-:1 FMUL cs0, cx00y00, alpha; +--:-:-:-:1 FMUL cs1, cx01y00, alpha; +--:-:-:-:1 FMUL cs2, cx02y00, alpha; +--:-:-:-:1 FMUL cs3, cx03y00, alpha; +--:-:-:-:1 FMUL cs4, cx64y00, alpha; +--:-:-:-:1 FMUL cs5, cx65y00, alpha; +--:-:-:-:1 FMUL cs6, cx66y00, alpha; +--:-:-:-:1 FMUL cs7, cx67y00, alpha; + +// We pre-increment the output addresses so they can be dual issued with memory ops +// So start with a -1 instead of 0 value. +--:-:-:-:1 IADD Cy00, Cy00, -ldc1; +--:-:-:-:1 IADD Cy04, Cy00, ldc4; +--:-:-:-:1 IADD Cy08, Cy00, ldc8; +--:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) + + + +// There's nothing yet in place to handle dependecies with subroutines. +// So don't schedule this block. + + + my $out; + foreach my $y (0..3, 64..67) + { + my ($wait, $comment) = $y == 64 ? ('--', '') : ('02',' // Wait Dep 2'); + + # Jump ahead 60 units (to get to the values at y=64) + $out .= + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n\n" . + + "02:-:-:-:1 IADD Cy00, Cy00, ldc60; // Wait Dep 2\n" . + "--:-:-:-:1 IADD Cy04, Cy04, ldc60;\n" . + "--:-:-:-:1 IADD Cy08, Cy08, ldc60;\n" . + "--:-:-:-:1 IADD Cy12, Cy12, ldc60;\n\n" if $y == 64; + + # We need to move the C values to the param registers of the STORE_C subroutine. + # This is also a good time to apply alpha. 
+ $out .= sprintf( + "%s:-:-:-:1 FMUL cs0, cx00y%02d, alpha;%s\n" . + "--:-:-:-:1 FMUL cs1, cx01y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs2, cx02y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs3, cx03y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs4, cx64y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs5, cx65y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs6, cx66y%02d, alpha;\n" . + "--:-:-:-:0 FMUL cs7, cx67y%02d, alpha; // Dual Issue\n", + $wait, $y, $comment, ($y) x 7) if $y; + + # Call the subroutine. + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +// And we're done. The remainder is the STORE_C subroutine that's defined at the end of the kernel. +--:-:-:-:5 EXIT; + +// This routine does warp synchronous shuffling of our output data so as to be able +// to have coalesced writes to global memory. This is actually faster because the shared +// memory latencies can be hidden by other warps and we're only adding a few extra clocks +// to this thread. Global memory here is the bottleneck and being able to halve the needed +// bandwidth at the expense of a few clocks is a modest win. This also keeps power lower +// and our chip running faster. + +// Note, the SHFL instruction doesn't help us here because we're swapping different registers +// from different threads. +STORE_C: + + + +// Each warp writes to its own region of memory so we don't need to bar.sync the access. +// There are some bank conflicts here on the STS.128s but no way to avoid them, and the hit just means a few extra clocks. +// Note here that the scheduler is able to handle the dependencies between vector and non-vector instructions. +// It knows from the instruction type and the register map that cs0 here includes cs1, cs2 and cs3 as well. +--:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], cs4; + +// In a single warp, loads naturally occur after the store to shared completes, no sync required. 
+--:-:-:-:1 LDS cs0, [readCs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS cs1, [readCs + 4x<0*128 + 64>]; +--:-:-:-:1 LDS cs2, [readCs + 4x<1*128 + 00>]; +--:-:-:-:1 LDS cs3, [readCs + 4x<1*128 + 64>]; +--:-:-:-:1 LDS cs4, [readCs + 4x<2*128 + 00>]; +--:-:-:-:1 LDS cs5, [readCs + 4x<2*128 + 64>]; +--:-:-:-:1 LDS cs6, [readCs + 4x<3*128 + 00>]; +--:-:1:-:1 LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1 + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; + +--:-:-:-:1 IADD Cy00, Cy00, ldc1; +--:-:-:-:1 IADD Cy04, Cy04, ldc1; +--:-:-:-:1 IADD Cy08, Cy08, ldc1; +--:-:-:-:1 IADD Cy12, Cy12, ldc1; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m + +01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 +--:-:-:-:1 @P1 STG.CG [Cy00 + 4x<64>], cs1; +--:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; +--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<64>], cs3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m + +--:-:-:-:1 @P0 STG.CG [Cy08 + 4x<00>], cs4; +--:-:-:-:1 @P1 STG.CG [Cy08 + 4x<64>], cs5; +--:-:-:-:1 @P2 STG.CG [Cy12 + 4x<00>], cs6; +--:2:-:-:1 @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2 + + + +--:-:-:-:5 RET; + diff --git a/Assembler/PascalAs/sgemm/sgemm64.sass b/Assembler/PascalAs/sgemm/sgemm64.sass new file mode 100644 index 0000000..f037b3e --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm64.sass @@ -0,0 +1,398 @@ +# Kernel: 
sgemm_kernel_64 +# +# SharedSize: 8192 +# Params(8): +# 0:0x140:4:4 param_C, +# 1:0x144:4:0 param_m, +# 2:0x148:4:0 param_n, +# 3:0x14c:4:0 param_k, +# 4:0x150:4:0 param_lda, +# 5:0x154:4:0 param_ldb, +# 6:0x158:4:0 param_ldc +# 7:0x15c:4:0 param_alpha +# 8:0x160:4:4 param_D // for diagnostic printf output +# +# Globals: +# c[0x0][0x164]: texA (the value is 1) +# c[0x0][0x168]: texB (the value is 0) + + + + 0-63 ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end + + 80 : zOffset + 0-63 : cz<00-63> + + 3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35> + 35,34,43,42,51,50,59,58 : cx32y<00-03|32-35> + 39,38,47,46,55,54,63,62 : cx33y<00-03|32-35> + 33,32,41,40,49,48,57,56 : cx34y<00-03|32-35> + 37,36,45,44,53,52,61,60 : cx35y<00-03|32-35> + + 64-79 : j0Ax<00-03|32-35>, j0By<00-03|32-35> + 80-95 : j1Ax<00-03|32-35>, j1By<00-03|32-35> + + 64-71 : cs<0-7> + + 96-111 : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3> + + 112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32 + + 72-111 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX + + + +--:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 +--:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 +--:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 + + + +// blk = tid >= 32 ? by : bx; +// ldx = tid >= 32 ? ldb : lda; +// tex = tid >= 32 ? 
texB : texA; +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1 +06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 +--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150]; +--:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; +--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA +--:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB + +--:-:-:-:1 LOP.AND zOffset, tid, -32; +--:-:-:-:1 STS.128 [zOffset + 4x<16*64>], RZ; + +// tid2 = (tid >> 4) & 1 +// tid15 = tid & 15 +// tid31 = tid & 31 +// tid32 = tid & 32 +--:-:-:-:1 BFE.U32 tid2, tid, 0x104; // 1 bit at position 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid32, tid, 32; + +// ldx4 = ldx * 4; +// ldx8 = ldx * 8; +--:-:-:-:1 SHR.U32 ldx, ldx4, 2; +--:-:-:-:1 IADD ldx8, ldx4, ldx4; + +// track0 = blk*64/4 + tid15 + (ldx * tid2) +--:-:-:-:1 ISCADD track0, blk, tid15, 4; +--:-:-:-:1 XMAD.LO track0, ldx, tid2, track0, xmad_t0; +--:-:-:-:1 IADD3 track2, track0, ldx, ldx; +--:-:-:-:1 IADD track4, track0, ldx4; +--:-:-:-:1 IADD track6, track2, ldx4; + +// writeS = tid15*4*4 + tid2*64*4 +--:-:-:-:1 SHL tid15_4, tid15, 4; +--:-:-:-:1 ISCADD writeS, tid2, tid15_4, 8; + +// writeS += 2048 if tid >= 32 +--:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*64>; + +// int end = track0 + (k-8)*ldx; +--:-:-:-:1 MOV k, c[0x0][0x14c]; +--:-:-:-:1 IADD k, k, -8; +--:-:-:-:1 XMAD.LO end, k, ldx, track0, xmad_end; + +// readAs = ((tid >> 1) & 7) << 4; +--:-:-:-:1 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 2048; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readBs, tid, 0x30; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<8*64>, 4; + + +--:-:1:-:1 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 +--:-:2:-:1 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:3:-:1 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3 
+--:-:4:-:1 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4 + + + + + + return join '', map sprintf("--:-:5:-:1 LDS.U.128 cz%02d, [zOffset + 4x<16*64>];\n", $_ * 4), 0..15; + + + + +01:-:-:-:1 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1 +02:-:-:-:1 STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2 +04:-:-:-:1 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3 +08:-:-:-:1 STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4 + +--:-:-:-:1 IADD track0, track0, ldx8; +--:-:-:-:1 IADD track2, track2, ldx8; +--:-:-:-:1 IADD track4, track4, ldx8; +--:-:-:-:1 IADD track6, track6, ldx8; + +10:-:-:-:5 BAR.SYNC 0; + + + +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*64>; + +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1 + +// Efficiency: +// ffma: 512 +// lds: 32 dual issued +// sts: 4 dual issued +// tex: 4 dual issued +// add: 4 +// xor: 3 +// setp: 1 +// bar: 1 dual issued +// bra: 1 dual issued +// Total: 520 (512/520 = 98.5% FFMA) + +LOOP: + +// Loop end condition +--:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; + + + + my @cOrder; + my @swirl = ([2,0],[2,1],[0,1],[0,0]); + my @x = (0,1,32,33); + foreach my $y (0,2,32,34) + { + foreach my $x (@x) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @x = reverse @x; + } + + my %insert = + ( + j0c31 => "--:-:-:-:1 \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;\n", + j0c33 => "--:-:2:-:1 \@P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2\n", + + j1c31 => "--:-:-:-:1 \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;\n", + j1c33 => "--:-:3:-:1 \@P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3\n", + + j5c30 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2\n", + j5c34 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<2*64>], loadX2;\n", + + j6c30 => "04:-:-:-:1 \@P0 
STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3\n", + j6c34 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*64>], loadX6;\n", + + j6c62 => + "01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<16*64>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<16*64>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<16*64>;\n", + + j7c63 => + "--:-:-:-:1 \@P0 IADD track0, track0, ldx8;\n" . + "--:-:-:-:1 \@P0 IADD track2, track2, ldx8;\n" . + "--:-:-:-:1 \@P0 IADD track4, track4, ldx8;\n" . + "--:-:-:-:0 \@P0 IADD track6, track6, ldx8;\n" . + "--:-:-:Y:5 \@P0 BRA LOOP;\n", + ); + + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx00, [readAs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBy00, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx32, [readAs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBy32, [readBs + 4x<%d*64 + 32>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $yield = $c == 32 ? 'Y' : '-'; + + my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--',''); + + my $stall = $ins =~ /LDS|TLD|STS|BAR/ ? 
0 : 1; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $comment, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 LOP.AND readAs, readAs, 0x7ff; +--:-:-:-:1 LOP.AND readBs, readBs, 0x7ff; + +// writeCs = (readBs / 4) * 64 + readAs; +--:-:-:-:1 ISCADD writeCs, readBs, readAs, 4; + +// readCs = ((tid32 << 3) + tid31) << 2; +--:-:-:-:1 ISCADD readCs, tid32, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = bx*64 + tid31; +--:-:-:-:1 ISCADD cx, bx, tid31, 6; + +// cy = by*64 + (tid32 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid32, 1; +--:-:-:-:1 ISCADD cy00, by, cy00, 6; + +// C += (cy*ldc + cx) * 4; +--:-:-:-:1 MOV ldc, c[0x0][0x158]; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_ci; +--:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; + +--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m +--:-:-:-:1 IADD cx, cx, 32; +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m + +// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4 +// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4 +//--:-:-:-:1 MOV gridDimX, c[0x0][0x14]; +//--:-:-:-:1 MOV blckDimX, c[0x0][0x8]; +//--:-:-:-:1 XMAD.LO D, by, gridDimX, bx, xmadD; +//--:-:-:-:1 XMAD.LO D, D, blckDimX, tid, xmadD; +//--:-:-:-:1 ISCADD D, D, c[0x0][0x160], 5; // 4 bytes * 8 vars = 32 or shift 5 + +//--:-:-:-:1 STG.CS [D + 4x<0>], readAs; +//--:-:-:-:1 STG.CS [D + 4x<1>], readBs; +//--:-:-:-:1 STG.CS [D + 4x<2>], writeCs; +//--:-:-:-:1 STG.CS [D + 4x<3>], readCs; +//--:-:-:-:1 STG.CS [D + 4x<4>], cx; +//--:-:-:-:1 STG.CS [D + 4x<5>], cy00; +//--:-:-:-:1 STG.CS [D + 4x<6>], ci; +//--:-:-:-:1 STG.CS [D + 4x<7>], cx35y35; + +--:-:-:-:1 IADD cy00, cy00, -1; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD cy12, cy00, 12; + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 SHL ldc8, ldc, 5; 
+--:-:-:-:1 ISCADD ldc28, ldc, -ldc4, 7; + +--:-:-:-:1 MOV alpha, c[0x0][0x15c]; +--:-:-:-:1 FMUL cs0, cx00y00, alpha; +--:-:-:-:1 FMUL cs1, cx01y00, alpha; +--:-:-:-:1 FMUL cs2, cx02y00, alpha; +--:-:-:-:1 FMUL cs3, cx03y00, alpha; +--:-:-:-:1 FMUL cs4, cx32y00, alpha; +--:-:-:-:1 FMUL cs5, cx33y00, alpha; +--:-:-:-:1 FMUL cs6, cx34y00, alpha; +--:-:-:-:1 FMUL cs7, cx35y00, alpha; + +--:-:-:-:1 IADD Cy00, Cy00, -ldc1; +--:-:-:-:1 IADD Cy04, Cy00, ldc4; +--:-:-:-:1 IADD Cy08, Cy00, ldc8; +--:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) + + + + + + my $out; + foreach my $y (0..3, 32..35) + { + my ($wait, $comment) = $y == 32 ? ('--', '') : ('02',' // Wait Dep 2'); + + $out .= + "--:-:-:-:1 IADD cy00, cy00, 28;\n" . + "--:-:-:-:1 IADD cy04, cy04, 28;\n" . + "--:-:-:-:1 IADD cy08, cy08, 28;\n" . + "--:-:-:-:1 IADD cy12, cy12, 28;\n\n" . + + "02:-:-:-:1 IADD Cy00, Cy00, ldc28; // Wait Dep 2\n" . + "--:-:-:-:1 IADD Cy04, Cy04, ldc28;\n" . + "--:-:-:-:1 IADD Cy08, Cy08, ldc28;\n" . + "--:-:-:-:1 IADD Cy12, Cy12, ldc28;\n\n" if $y == 32; + + $out .= sprintf( + "%s:-:-:-:1 FMUL cs0, cx00y%02d, alpha;%s\n" . + "--:-:-:-:1 FMUL cs1, cx01y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs2, cx02y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs3, cx03y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs4, cx32y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs5, cx33y%02d, alpha;\n" . + "--:-:-:-:1 FMUL cs6, cx34y%02d, alpha;\n" . 
+ "--:-:-:-:0 FMUL cs7, cx35y%02d, alpha; // Dual Issue\n", + $wait, $y, $comment, ($y) x 7) if $y; + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + + +--:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], cs4; + +--:-:-:-:1 LDS cs0, [readCs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS cs1, [readCs + 4x<0*64 + 32>]; +--:-:-:-:1 LDS cs2, [readCs + 4x<1*64 + 00>]; +--:-:-:-:1 LDS cs3, [readCs + 4x<1*64 + 32>]; +--:-:-:-:1 LDS cs4, [readCs + 4x<2*64 + 00>]; +--:-:-:-:1 LDS cs5, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 LDS cs6, [readCs + 4x<3*64 + 00>]; +--:-:1:-:1 LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1 + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; + +--:-:-:-:1 IADD Cy00, Cy00, ldc1; +--:-:-:-:1 IADD Cy04, Cy04, ldc1; +--:-:-:-:1 IADD Cy08, Cy08, ldc1; +--:-:-:-:1 IADD Cy12, Cy12, ldc1; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m + +01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 +--:-:-:-:1 @P1 STG.CG [Cy00 + 4x<32>], cs1; +--:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; +--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<32>], cs3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m + +--:-:-:-:1 @P0 STG.CG [Cy08 + 4x<00>], cs4; +--:-:-:-:1 @P1 STG.CG [Cy08 + 4x<32>], cs5; +--:-:-:-:1 @P2 STG.CG [Cy12 + 4x<00>], cs6; +--:2:-:-:1 @P3 STG.CG [Cy12 + 
4x<32>], cs7; // Set Dep 2 + + + +--:-:-:-:5 RET; + diff --git a/Assembler/PascalAs/sgemm/sgemm_final_128.sass b/Assembler/PascalAs/sgemm/sgemm_final_128.sass new file mode 100644 index 0000000..ce7b0e7 --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm_final_128.sass @@ -0,0 +1,793 @@ +# Kernel: sgemm_kernel_128 +# Arch: sm_50 +# InsCnt: 770 +# RegCnt: 118 +# SharedSize: 16384 +# BarCnt: 1 +# Params(9): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 +# 3:0x14c:4:0 +# 4:0x150:4:0 +# 5:0x154:4:0 +# 6:0x158:4:0 +# 7:0x15c:4:0 +# 8:0x160:4:0 +# +# Instructions: + +--:-:1:-:1 S2R R112, SR_TID.X; +--:-:2:-:1 S2R R113, SR_CTAID.X; +--:-:3:-:1 S2R R114, SR_CTAID.Y; +01:-:-:Y:1 ISETP.GE.AND P0, PT, R112.reuse, 0x80, PT; +--:-:-:-:1 LOP.AND R117, R112.reuse, 0x1f; +--:-:-:-:1 BFE.U32 R9, R112.reuse, 0x205; +--:-:-:-:1 MOV R13, c[0x0][0x14c]; +--:-:-:-:1 BFE.U32 R4, R112.reuse, 0x301; +--:-:-:-:1 LOP.AND R115, R112.reuse, 0x80; +--:-:-:-:1 LOP.AND R107, R112.reuse, 0x70; +--:-:-:-:1 SHL R16, R117, 0x4; +--:-:-:-:1 LOP.AND R0, R112.reuse, 0x1; +--:-:-:-:1 IADD R13, R13, -0x8; +--:-:-:-:1 LOP.AND R80, R112.reuse, -0x20; +--:-:-:-:1 SHR.U32 R106, R115, 0x4; +--:-:-:-:1 LOP.AND R116, R112, 0x60; +--:-:-:-:1 SHR.U32 R107, R107, 0x3; +--:-:-:-:0 @!P0 MOV R1, c[0x0][0x150]; +--:-:-:-:1 STS.128 [R80+0x2000], RZ; +--:-:-:-:1 @P0 MOV R1, c[0x0][0x154]; +--:-:-:-:1 ISCADD R111, R9, R16, 0x9; +06:-:-:-:1 SEL R12, R114, R113, P0; +--:-:-:-:1 @!P0 MOV32I R110, 0x80000001; +--:-:-:-:1 @P0 MOV32I R110, 0x80000000; +--:-:-:-:1 LOP.OR R106, R106, R4; +--:-:-:-:1 SHR.U32 R8, R1.reuse, 0x2; +--:-:-:-:1 LOP.OR R107, R107, R0; +--:-:-:-:1 ISCADD R104, R12, R117, 0x5; +--:-:-:-:1 IADD R109, R1, R1; +--:-:-:-:1 @P0 IADD R111, R111, 0x1000; +--:-:-:-:1 SHL R106, R106, 0x4; +--:-:-:-:1 XMAD.MRG R5, R8.reuse, R9.H1.reuse, RZ; +--:-:-:-:1 ISCADD R107, R107, 0x1000, 0x4; +--:-:-:-:1 XMAD R104, R8.reuse, R9, R104; +--:-:-:Y:5 XMAD.MRG R20, R13.reuse, R8.H1.reuse, RZ; +--:-:-:-:2 
XMAD.PSL.CBCC R104, R8.H1, R5.H1, R104; +--:-:1:-:4 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf; +--:-:-:-:1 IADD R108, R104, R1; +--:-:-:-:1 XMAD R105, R13.reuse, R8, R104; +--:-:2:Y:5 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf; +--:-:-:-:1 XMAD.PSL.CBCC R105, R13.H1, R20.H1, R105; +--:-:3:-:1 LDS.U.128 R0, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R4, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R8, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R12, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R16, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R20, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R24, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R28, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R32, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R36, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R40, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R44, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R48, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R52, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R56, [R80+0x2000]; +--:-:3:-:1 LDS.U.128 R60, [R80+0x2000]; +01:-:-:-:1 STS.128 [R111], R96; +--:-:-:-:0 IADD R104, R104, R109.reuse; +02:-:-:-:1 STS.128 [R111+0x800], R100; +--:-:-:-:0 IADD R108, R108, R109; +04:-:-:-:5 BAR.SYNC 0x0; +--:-:-:-:0 LOP.XOR R111, R111, 0x2000; +--:-:-:-:1 LDS.U.128 R64, [R106]; +--:-:-:-:1 LDS.U.128 R72, [R107]; +--:-:-:-:1 LDS.U.128 R68, [R106+0x100]; +--:-:1:-:1 LDS.U.128 R76, [R107+0x100]; +TARGET1: +--:-:-:-:1 ISETP.LE.AND P0, PT, R104, R105, PT; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R106+0x200]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R107+0x200]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R106+0x300]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R107+0x300]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, 
R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:0 FFMA R11, R64.reuse, R74, R11; +--:-:2:-:1 @P0 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:0 FFMA R16, R66, R77.reuse, R16; +--:-:3:-:1 @P0 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, 
R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R106+0x400]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R107+0x400]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R106+0x500]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R107+0x500]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; 
+--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R106+0x600]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R107+0x600]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R106+0x700]; +--:-:-:-:1 
FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R107+0x700]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; 
+--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R106+0x800]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R107+0x800]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R106+0x900]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R107+0x900]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, 
R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R106+0xa00]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 
R88, [R107+0xa00]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R106+0xb00]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R107+0xb00]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, 
R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R106+0xc00]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R107+0xc00]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R106+0xd00]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R107+0xd00]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, 
R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, 
R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R106+0xe00]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R107+0xe00]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R106+0xf00]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R107+0xf00]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:0 FFMA R10, R64.reuse, R75, R10; +02:-:-:-:1 @P0 STS.128 [R111], R96; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:0 FFMA R18, R64.reuse, R77.reuse, R18; +04:-:-:-:1 @P0 STS.128 [R111+0x800], R100; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; 
+--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:0 FFMA R26, R64.reuse, R79, R26; +01:-:-:-:5 BAR.SYNC 0x0; +--:-:-:-:1 @P0 LOP.XOR R106, R106, 0x2000; +--:-:-:-:1 @P0 LOP.XOR R107, R107, 0x2000; +--:-:-:-:1 @P0 LOP.XOR R111, R111, 0x2000; +--:-:-:-:1 FFMA R27, R64, R78, R27; +--:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 @P0 LDS.U.128 R64, [R106]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 @P0 LDS.U.128 R72, [R107]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 @P0 LDS.U.128 R68, [R106+0x100]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 @P0 LDS.U.128 R76, [R107+0x100]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; 
+--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, 
R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +--:-:-:-:1 @P0 IADD R104, R104, R109.reuse; +--:-:-:-:0 @P0 IADD R108, R108, R109; +--:-:-:Y:5 @P0 BRA TARGET1; +--:-:-:-:1 SHR.U32 R84, R115, 0x2; +--:-:-:-:1 MOV R77, c[0x0][0x158]; +--:-:-:-:1 SHR.U32 R80, R116.reuse, 0x1; +--:-:-:-:1 MOV R72, c[0x0][0x15c]; +--:-:-:-:1 SHL R89, R116, 0x4; +--:-:-:-:1 LOP.AND R106, R106, 0xfff; +--:-:-:-:1 LOP.OR R84, R117, R84; +--:-:-:-:1 SHL R81, R77.reuse, 0x2; +--:-:-:-:1 LOP.AND R107, R107, 0xfff; +--:-:-:-:1 ISCADD R80, R114, R80, 0x7; +--:-:-:-:1 FMUL R64, R3, R72.reuse; +--:-:-:-:1 SHL R74, R77.reuse, 0x4; +--:-:-:-:1 LOP.OR R89, R89, R84; +--:-:-:-:1 ISCADD R84, R113, R84, 0x7; +--:-:-:-:1 FMUL R65, R7, R72.reuse; +--:-:-:-:1 SHL R88, R77, 0x5; +--:-:-:-:1 XMAD.MRG R78, R80.reuse, R77.H1.reuse, RZ; +--:-:-:-:1 ISCADD R90, R107, R106, 0x5; +--:-:-:-:1 FMUL R66, R1, R72.reuse; +--:-:-:-:1 SHL R89, R89, 0x2; +--:-:-:-:1 XMAD R73, R80, R77, R84; +--:-:-:-:1 ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; +--:-:-:-:1 IADD R84, R84, 0x40; +--:-:-:-:1 ISCADD R85, R77, -R74, 0x8; +--:-:-:-:1 FMUL R67, R5, R72.reuse; +--:-:-:-:1 FMUL R68, R35, R72.reuse; +--:-:-:-:1 XMAD.PSL.CBCC R73, R80.H1, R78.H1, R73; +--:-:-:-:1 IADD R80, R80, -0x1; +--:-:-:-:1 ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT; +--:-:-:-:1 FMUL R69, R39, R72.reuse; +--:-:-:-:1 FMUL R70, R33, R72.reuse; +--:-:-:-:1 FMUL R71, R37, R72; +--:-:-:-:1 ISCADD R76, R73, c[0x0][0x140], 0x2; +--:-:-:-:1 IADD R83, R80.reuse, 0x4; +--:-:-:-:1 IADD R86, R80.reuse, 0x8; +--:-:-:-:3 IADD R87, R80, 0xc; +--:-:-:Y:6 IADD R76, R76, -R81; +--:-:-:-:1 IADD R75, R76.reuse, R74; +--:-:-:Y:5 IADD R79, R76, R88.reuse; 
+--:-:-:-:0 IADD R82, R75, R88; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R2, R72.reuse; +--:-:-:-:1 FMUL R65, R6, R72.reuse; +--:-:-:-:1 FMUL R66, R0, R72.reuse; +--:-:-:-:1 FMUL R67, R4, R72.reuse; +--:-:-:-:1 FMUL R68, R34, R72.reuse; +--:-:-:-:1 FMUL R69, R38, R72.reuse; +--:-:-:-:1 FMUL R70, R32, R72.reuse; +--:-:-:-:0 FMUL R71, R36, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R11, R72.reuse; +--:-:-:-:1 FMUL R65, R15, R72.reuse; +--:-:-:-:1 FMUL R66, R9, R72.reuse; +--:-:-:-:1 FMUL R67, R13, R72.reuse; +--:-:-:-:1 FMUL R68, R43, R72.reuse; +--:-:-:-:1 FMUL R69, R47, R72.reuse; +--:-:-:-:1 FMUL R70, R41, R72.reuse; +--:-:-:-:0 FMUL R71, R45, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R10, R72.reuse; +--:-:-:-:1 FMUL R65, R14, R72.reuse; +--:-:-:-:1 FMUL R66, R8, R72.reuse; +--:-:-:-:1 FMUL R67, R12, R72.reuse; +--:-:-:-:1 FMUL R68, R42, R72.reuse; +--:-:-:-:1 FMUL R69, R46, R72.reuse; +--:-:-:-:1 FMUL R70, R40, R72.reuse; +--:-:-:-:0 FMUL R71, R44, R72; +--:-:-:-:5 CAL TARGET2; +--:-:-:-:1 IADD R80, R80, 0x3c; +--:-:-:-:1 IADD R83, R83, 0x3c; +--:-:-:-:1 IADD R86, R86, 0x3c; +--:-:-:-:1 IADD R87, R87, 0x3c; +02:-:-:-:1 IADD R76, R76, R85.reuse; +--:-:-:-:1 IADD R75, R75, R85.reuse; +--:-:-:-:1 IADD R79, R79, R85.reuse; +--:-:-:-:1 IADD R82, R82, R85; +--:-:-:-:1 FMUL R64, R19, R72.reuse; +--:-:-:-:1 FMUL R65, R23, R72.reuse; +--:-:-:-:1 FMUL R66, R17, R72.reuse; +--:-:-:-:1 FMUL R67, R21, R72.reuse; +--:-:-:-:1 FMUL R68, R51, R72.reuse; +--:-:-:-:1 FMUL R69, R55, R72.reuse; +--:-:-:-:1 FMUL R70, R49, R72.reuse; +--:-:-:-:0 FMUL R71, R53, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R18, R72.reuse; +--:-:-:-:1 FMUL R65, R22, R72.reuse; +--:-:-:-:1 FMUL R66, R16, R72.reuse; +--:-:-:-:1 FMUL R67, R20, R72.reuse; +--:-:-:-:1 FMUL R68, R50, R72.reuse; +--:-:-:-:1 FMUL R69, R54, R72.reuse; +--:-:-:-:1 FMUL R70, R48, R72.reuse; +--:-:-:-:0 FMUL R71, R52, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R27, R72.reuse; +--:-:-:-:1 FMUL 
R65, R31, R72.reuse; +--:-:-:-:1 FMUL R66, R25, R72.reuse; +--:-:-:-:1 FMUL R67, R29, R72.reuse; +--:-:-:-:1 FMUL R68, R59, R72.reuse; +--:-:-:-:1 FMUL R69, R63, R72.reuse; +--:-:-:-:1 FMUL R70, R57, R72.reuse; +--:-:-:-:0 FMUL R71, R61, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R26, R72.reuse; +--:-:-:-:1 FMUL R65, R30, R72.reuse; +--:-:-:-:1 FMUL R66, R24, R72.reuse; +--:-:-:-:1 FMUL R67, R28, R72.reuse; +--:-:-:-:1 FMUL R68, R58, R72.reuse; +--:-:-:-:1 FMUL R69, R62, R72.reuse; +--:-:-:-:1 FMUL R70, R56, R72.reuse; +--:-:-:-:0 FMUL R71, R60, R72; +--:-:-:-:5 CAL TARGET2; +--:-:-:-:5 EXIT; +TARGET2: +--:-:-:-:0 IADD R80, R80, 0x1; +--:-:-:-:1 STS.128 [R90], R64; +--:-:-:-:0 IADD R83, R83, 0x1; +--:-:-:-:1 STS.128 [R90+0x100], R68; +--:-:-:-:0 IADD R86, R86, 0x1; +--:-:-:-:1 LDS R64, [R89]; +--:-:-:-:0 IADD R87, R87, 0x1; +--:-:-:-:1 LDS R65, [R89+0x100]; +--:-:-:-:0 IADD R76, R76, R81.reuse; +--:-:-:-:1 LDS R66, [R89+0x200]; +--:-:-:-:0 IADD R75, R75, R81.reuse; +--:-:-:-:1 LDS R67, [R89+0x300]; +--:-:-:-:0 IADD R79, R79, R81.reuse; +--:-:-:-:1 LDS R68, [R89+0x400]; +--:-:-:-:0 IADD R82, R82, R81; +--:-:-:-:1 LDS R69, [R89+0x500]; +--:-:-:-:1 ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; +--:-:-:-:1 LDS R70, [R89+0x600]; +--:-:-:-:1 ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6; +--:-:1:-:1 LDS R71, [R89+0x700]; +--:-:-:-:2 ISETP.LT.AND P2, PT, R83.reuse, c[0x0][0x148], P5; +--:-:-:Y:7 ISETP.LT.AND P3, PT, R83, c[0x0][0x148], P6; +01:-:-:-:1 @P0 STG.CG [R76], R64; +--:-:-:-:1 ISETP.LT.AND P0, PT, R86.reuse, c[0x0][0x148], P5; +--:-:-:-:1 @P1 STG.CG [R76+0x100], R65; +--:-:-:-:1 ISETP.LT.AND P1, PT, R86, c[0x0][0x148], P6; +--:-:-:-:1 @P2 STG.CG [R75], R66; +--:-:-:-:1 ISETP.LT.AND P2, PT, R87.reuse, c[0x0][0x148], P5; +--:-:-:-:1 @P3 STG.CG [R75+0x100], R67; +--:-:-:Y:7 ISETP.LT.AND P3, PT, R87, c[0x0][0x148], P6; +--:-:-:-:2 @P0 STG.CG [R79], R68; +--:-:-:-:2 @P1 STG.CG [R79+0x100], R69; +--:-:-:-:2 @P2 STG.CG [R82], R70; +--:2:-:-:1 @P3 STG.CG 
[R82+0x100], R71; +--:-:-:-:5 RET; diff --git a/Assembler/PascalAs/sgemm/sgemm_final_64.sass b/Assembler/PascalAs/sgemm/sgemm_final_64.sass new file mode 100644 index 0000000..815ae5d --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm_final_64.sass @@ -0,0 +1,802 @@ +# Kernel: sgemm_kernel_64 +# Arch: sm_50 +# InsCnt: 779 +# RegCnt: 127 +# SharedSize: 8192 +# BarCnt: 1 +# Params(9): +# ord:addr:size:align +# 0:0x140:4:0 +# 1:0x144:4:0 +# 2:0x148:4:0 +# 3:0x14c:4:0 +# 4:0x150:4:0 +# 5:0x154:4:0 +# 6:0x158:4:0 +# 7:0x15c:4:0 +# 8:0x160:4:0 +# +# Instructions: + +--:-:1:-:1 S2R R119, SR_TID.X; +--:-:2:-:1 S2R R125, SR_CTAID.X; +--:-:3:-:1 S2R R122, SR_CTAID.Y; +01:-:-:-:1 ISETP.GE.AND P0, PT, R119.reuse, 0x20, PT; +--:-:-:-:1 LOP.AND R9, R119.reuse, 0xf; +--:-:-:-:1 BFE.U32 R4, R119.reuse, 0x104; +--:-:-:-:1 MOV R12, c[0x0][0x14c]; +--:-:-:-:1 BFE.U32 R114, R119.reuse, 0x301; +--:-:-:-:1 LOP.AND R115, R119.reuse, 0x30; +--:-:-:-:1 LOP.AND R0, R119.reuse, 0x1; +--:-:-:-:1 SHL R13, R9, 0x4; +--:-:-:-:1 LOP.AND R80, R119.reuse, -0x20; +--:-:-:-:1 IADD R12, R12, -0x8; +--:-:-:-:1 SHL R114, R114, 0x4; +--:-:-:-:1 LOP.AND R126, R119, 0x1f; +--:-:-:-:1 SHR.U32 R115, R115, 0x3; +--:-:-:-:0 @!P0 MOV R2, c[0x0][0x150]; +--:-:-:-:1 STS.128 [R80+0x1000], RZ; +--:-:-:-:1 @P0 MOV R2, c[0x0][0x154]; +--:-:-:-:1 ISCADD R118, R4, R13, 0x8; +06:-:-:-:1 SEL R8, R122, R125, P0; +--:-:-:-:1 @!P0 MOV32I R113, 0x80000001; +--:-:-:-:1 @P0 MOV32I R113, 0x80000000; +--:-:-:-:1 LOP.OR R115, R115, R0; +--:-:-:-:1 SHR.U32 R1, R2.reuse, 0x2; +--:-:-:-:1 LOP.AND R123, R119, 0x20; +--:-:-:-:1 ISCADD R112, R8, R9, 0x4; +--:-:-:-:1 IADD R121, R2, R2; +--:-:-:-:1 @P0 IADD R118, R118, 0x800; +--:-:-:-:1 ISCADD R115, R115, 0x800, 0x4; +--:-:-:-:1 XMAD.MRG R5, R1.reuse, R4.H1.reuse, RZ; +--:-:-:-:1 XMAD.MRG R16, R12.reuse, R1.H1.reuse, RZ; +--:-:-:Y:6 XMAD R112, R1.reuse, R4, R112; +--:-:-:-:2 XMAD.PSL.CBCC R112, R1.H1, R5.H1, R112; +--:-:1:-:4 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf; +--:-:-:-:1 IADD3 
R116, R112.reuse, R1.reuse, R1; +--:-:-:-:1 IADD R120, R112, R2.reuse; +--:-:2:-:1 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf; +--:-:-:-:0 XMAD R117, R12.reuse, R1, R112; +--:-:3:-:3 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf; +--:-:-:-:2 IADD R124, R116, R2; +--:-:4:-:1 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf; +--:-:-:-:1 XMAD.PSL.CBCC R117, R12.H1, R16.H1, R117; +--:-:5:-:1 LDS.U.128 R0, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R4, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R8, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R12, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R16, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R20, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R24, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R28, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R32, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R36, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R40, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R44, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R48, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R52, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R56, [R80+0x1000]; +--:-:5:-:1 LDS.U.128 R60, [R80+0x1000]; +01:-:-:-:1 STS.128 [R118], R96; +--:-:-:-:0 IADD R112, R112, R121.reuse; +02:-:-:-:1 STS.128 [R118+0x200], R100; +--:-:-:-:0 IADD R116, R116, R121.reuse; +04:-:-:-:1 STS.128 [R118+0x400], R104; +--:-:-:-:0 IADD R120, R120, R121.reuse; +08:-:-:-:1 STS.128 [R118+0x600], R108; +--:-:-:-:0 IADD R124, R124, R121; +10:-:-:-:5 BAR.SYNC 0x0; +--:-:-:-:0 LOP.XOR R118, R118, 0x1000; +--:-:-:-:1 LDS.U.128 R64, [R114]; +--:-:-:-:1 LDS.U.128 R72, [R115]; +--:-:-:-:1 LDS.U.128 R68, [R114+0x80]; +--:-:1:-:1 LDS.U.128 R76, [R115+0x80]; +TARGET1: +--:-:-:-:1 ISETP.LE.AND P0, PT, R112, R117, PT; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R114+0x100]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R115+0x100]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R114+0x180]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; 
+--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R115+0x180]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:0 FFMA R11, R64.reuse, R74, R11; +--:-:-:-:1 @P0 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:0 FFMA R16, R66, R77.reuse, R16; +--:-:2:-:1 @P0 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; 
+--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R114+0x200]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R115+0x200]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R114+0x280]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R115+0x280]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, 
R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:0 FFMA R11, R80.reuse, R90, R11; +--:-:-:-:1 @P0 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:0 FFMA R16, R82, R93.reuse, R16; +--:-:3:-:1 @P0 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, 
R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R114+0x300]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R115+0x300]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R114+0x380]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R115+0x380]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; 
+--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R114+0x400]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R115+0x400]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R114+0x480]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R115+0x480]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, 
R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; 
+--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R114+0x500]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R115+0x500]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R114+0x580]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R115+0x580]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:1 FFMA R10, R64.reuse, R75, R10; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, 
R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:1 FFMA R26, R64.reuse, R79, R26; +--:-:-:-:1 FFMA R27, R64, R78, R27; +01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 LDS.U.128 R64, [R114+0x600]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 LDS.U.128 R72, [R115+0x600]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; +--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 LDS.U.128 R68, [R114+0x680]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 LDS.U.128 R76, [R115+0x680]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, 
R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:0 FFMA R10, R80.reuse, R91, R10; +02:-:-:-:1 @P0 STS.128 [R118], R96; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:0 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 @P0 STS.128 [R118+0x200], R100; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, 
R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; +--:-:-:-:1 LDS.U.128 R80, [R114+0x700]; +--:-:-:-:1 FFMA R0, R66, R73.reuse, R0; +--:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; +--:-:-:-:1 LDS.U.128 R88, [R115+0x700]; +--:-:-:-:1 FFMA R3, R64, R72.reuse, R3; +--:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; +--:-:-:-:1 LDS.U.128 R84, [R114+0x780]; +--:-:-:-:1 FFMA R4, R67, R73.reuse, R4; +--:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; +--:-:1:-:1 LDS.U.128 R92, [R115+0x780]; +--:-:-:-:1 FFMA R7, R65, R72.reuse, R7; +--:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; +--:-:-:-:1 FFMA R32, R70, R73.reuse, R32; +--:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; +--:-:-:-:1 FFMA R35, R68, R72.reuse, R35; +--:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; +--:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; +--:-:-:-:1 FFMA R38, R69.reuse, R73, R38; +--:-:-:-:1 FFMA R39, R69.reuse, R72, R39; +--:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; +--:-:-:-:1 FFMA R44, R71, R75.reuse, R44; +--:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; +--:-:-:-:1 FFMA R47, R69, R74.reuse, R47; +--:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; +--:-:-:-:1 FFMA R40, R70, R75.reuse, R40; +--:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; +--:-:-:-:1 FFMA R43, R68, R74.reuse, R43; +--:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; +--:-:-:-:1 FFMA R12, R67, R75.reuse, R12; +--:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; +--:-:-:-:1 FFMA R15, R65, R74.reuse, R15; +--:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; +--:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; +--:-:-:-:0 FFMA R10, R64.reuse, R75, R10; +04:-:-:-:1 
@P0 STS.128 [R118+0x400], R104; +--:-:-:-:1 FFMA R11, R64.reuse, R74, R11; +--:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; +--:-:-:-:1 FFMA R16, R66, R77.reuse, R16; +--:-:-:-:0 FFMA R18, R64.reuse, R77.reuse, R18; +--:-:-:-:1 @P0 STS.128 [R118+0x600], R108; +--:-:-:-:1 FFMA R19, R64, R76.reuse, R19; +--:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; +--:-:-:-:1 FFMA R20, R67, R77.reuse, R20; +--:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; +--:-:-:-:1 FFMA R23, R65, R76.reuse, R23; +--:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; +--:-:-:-:1 FFMA R48, R70, R77.reuse, R48; +--:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; +--:-:-:-:1 FFMA R51, R68, R76.reuse, R51; +--:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; +--:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; +--:-:-:-:1 FFMA R54, R69.reuse, R77, R54; +--:-:-:-:1 FFMA R55, R69.reuse, R76, R55; +--:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; +--:-:-:-:1 FFMA R60, R71, R79.reuse, R60; +--:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; +--:-:-:-:1 FFMA R63, R69, R78.reuse, R63; +--:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; +--:-:-:-:1 FFMA R56, R70, R79.reuse, R56; +--:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; +--:-:-:-:1 FFMA R59, R68, R78.reuse, R59; +--:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; +--:-:-:-:1 FFMA R28, R67, R79.reuse, R28; +--:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; +--:-:-:-:1 FFMA R31, R65, R78.reuse, R31; +--:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; +--:-:-:-:1 FFMA R24, R66, R79.reuse, R24; +--:-:-:-:0 FFMA R26, R64.reuse, R79, R26; +01:-:-:-:5 BAR.SYNC 0x0; +--:-:-:-:1 @P0 LOP.XOR R114, R114, 0x1000; +--:-:-:-:1 @P0 LOP.XOR R115, R115, 0x1000; +--:-:-:-:1 @P0 LOP.XOR R118, R118, 0x1000; +--:-:-:-:1 FFMA R27, R64, R78, R27; +--:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; +--:-:-:-:1 @P0 LDS.U.128 R64, [R114]; +--:-:-:-:1 FFMA R0, R82, R89.reuse, R0; +--:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; +--:-:-:-:1 @P0 LDS.U.128 R72, [R115]; +--:-:-:-:1 FFMA R3, R80, R88.reuse, R3; 
+--:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; +--:-:-:-:1 @P0 LDS.U.128 R68, [R114+0x80]; +--:-:-:-:1 FFMA R4, R83, R89.reuse, R4; +--:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; +--:-:1:-:1 @P0 LDS.U.128 R76, [R115+0x80]; +--:-:-:-:1 FFMA R7, R81, R88.reuse, R7; +--:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; +--:-:-:-:1 FFMA R32, R86, R89.reuse, R32; +--:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; +--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; +--:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; +--:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; +--:-:-:-:1 FFMA R38, R85.reuse, R89, R38; +--:-:-:-:1 FFMA R39, R85.reuse, R88, R39; +--:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; +--:-:-:-:1 FFMA R44, R87, R91.reuse, R44; +--:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; +--:-:-:-:1 FFMA R47, R85, R90.reuse, R47; +--:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; +--:-:-:-:1 FFMA R40, R86, R91.reuse, R40; +--:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; +--:-:-:-:1 FFMA R43, R84, R90.reuse, R43; +--:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; +--:-:-:-:1 FFMA R12, R83, R91.reuse, R12; +--:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; +--:-:-:-:1 FFMA R15, R81, R90.reuse, R15; +--:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; +--:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; +--:-:-:-:1 FFMA R10, R80.reuse, R91, R10; +--:-:-:-:1 FFMA R11, R80.reuse, R90, R11; +--:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; +--:-:-:-:1 FFMA R16, R82, R93.reuse, R16; +--:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; +--:-:-:-:1 FFMA R19, R80, R92.reuse, R19; +--:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; +--:-:-:-:1 FFMA R20, R83, R93.reuse, R20; +--:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; +--:-:-:-:1 FFMA R23, R81, R92.reuse, R23; +--:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; +--:-:-:-:1 FFMA R48, R86, R93.reuse, R48; +--:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; +--:-:-:-:1 FFMA R51, R84, R92.reuse, R51; +--:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; +--:-:-:-:1 FFMA R52, R87.reuse, 
R93.reuse, R52; +--:-:-:-:1 FFMA R54, R85.reuse, R93, R54; +--:-:-:-:1 FFMA R55, R85.reuse, R92, R55; +--:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; +--:-:-:-:1 FFMA R60, R87, R95.reuse, R60; +--:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; +--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; +--:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; +--:-:-:-:1 FFMA R56, R86, R95.reuse, R56; +--:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; +--:-:-:-:1 FFMA R59, R84, R94.reuse, R59; +--:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; +--:-:-:-:1 FFMA R28, R83, R95.reuse, R28; +--:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; +--:-:-:-:1 FFMA R31, R81, R94.reuse, R31; +--:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; +--:-:-:-:1 FFMA R24, R82, R95.reuse, R24; +--:-:-:-:1 FFMA R26, R80.reuse, R95, R26; +--:-:-:-:1 FFMA R27, R80, R94, R27; +--:-:-:-:1 @P0 IADD R112, R112, R121.reuse; +--:-:-:-:1 @P0 IADD R116, R116, R121.reuse; +--:-:-:-:1 @P0 IADD R120, R120, R121.reuse; +--:-:-:-:0 @P0 IADD R124, R124, R121; +--:-:-:Y:5 @P0 BRA TARGET1; +--:-:-:-:1 SHR.U32 R80, R123.reuse, 0x1; +--:-:-:-:1 MOV R81, c[0x0][0x158]; +--:-:-:-:1 ISCADD R84, R125, R126.reuse, 0x6; +--:-:-:-:1 MOV R72, c[0x0][0x15c]; +--:-:-:-:1 ISCADD R92, R123, R126, 0x3; +--:-:-:-:1 LOP.AND R114, R114, 0x7ff; +--:-:-:-:1 ISCADD R80, R122, R80, 0x6; +--:-:-:-:1 LOP.AND R115, R115, 0x7ff; +--:-:-:-:1 SHL R77, R81.reuse, 0x2; +--:-:-:-:1 ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; +--:-:-:-:1 SHL R89, R81.reuse, 0x4; +--:-:-:-:1 FMUL R64, R3, R72; +--:-:-:-:1 SHL R91, R81.reuse, 0x5; +--:-:-:-:1 XMAD.MRG R74, R80.reuse, R81.H1.reuse, RZ; +--:-:-:-:1 ISCADD R93, R115, R114, 0x4; +--:-:-:-:1 XMAD R73, R80, R81, R84; +--:-:-:-:1 SHL R92, R92, 0x2; +--:-:-:-:1 IADD R84, R84, 0x20; +--:-:-:-:1 ISCADD R85, R81, -R89, 0x7; +--:-:-:-:1 FMUL R65, R7, R72.reuse; +--:-:-:-:1 FMUL R66, R1, R72.reuse; +--:-:-:-:1 XMAD.PSL.CBCC R73, R80.H1, R74.H1, R73; +--:-:-:-:1 IADD R80, R80, -0x1; +--:-:-:-:1 ISETP.LT.AND P6, PT, R84, c[0x0][0x144], 
PT; +--:-:-:-:1 FMUL R67, R5, R72.reuse; +--:-:-:-:1 FMUL R68, R35, R72.reuse; +--:-:-:-:1 FMUL R69, R39, R72.reuse; +--:-:-:-:1 ISCADD R76, R73, c[0x0][0x140], 0x2; +--:-:-:-:1 IADD R86, R80.reuse, 0x4; +--:-:-:-:1 IADD R87, R80.reuse, 0x8; +--:-:-:-:1 IADD R88, R80, 0xc; +--:-:-:-:1 FMUL R70, R33, R72.reuse; +--:-:-:-:1 FMUL R71, R37, R72; +--:-:-:Y:6 IADD R76, R76, -R77; +--:-:-:-:1 IADD R75, R76.reuse, R89; +--:-:-:Y:5 IADD R78, R76, R91.reuse; +--:-:-:-:0 IADD R79, R75, R91; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R2, R72.reuse; +--:-:-:-:1 FMUL R65, R6, R72.reuse; +--:-:-:-:1 FMUL R66, R0, R72.reuse; +--:-:-:-:1 FMUL R67, R4, R72.reuse; +--:-:-:-:1 FMUL R68, R34, R72.reuse; +--:-:-:-:1 FMUL R69, R38, R72.reuse; +--:-:-:-:1 FMUL R70, R32, R72.reuse; +--:-:-:-:0 FMUL R71, R36, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R11, R72.reuse; +--:-:-:-:1 FMUL R65, R15, R72.reuse; +--:-:-:-:1 FMUL R66, R9, R72.reuse; +--:-:-:-:1 FMUL R67, R13, R72.reuse; +--:-:-:-:1 FMUL R68, R43, R72.reuse; +--:-:-:-:1 FMUL R69, R47, R72.reuse; +--:-:-:-:1 FMUL R70, R41, R72.reuse; +--:-:-:-:0 FMUL R71, R45, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R10, R72.reuse; +--:-:-:-:1 FMUL R65, R14, R72.reuse; +--:-:-:-:1 FMUL R66, R8, R72.reuse; +--:-:-:-:1 FMUL R67, R12, R72.reuse; +--:-:-:-:1 FMUL R68, R42, R72.reuse; +--:-:-:-:1 FMUL R69, R46, R72.reuse; +--:-:-:-:1 FMUL R70, R40, R72.reuse; +--:-:-:-:0 FMUL R71, R44, R72; +--:-:-:-:5 CAL TARGET2; +--:-:-:-:1 IADD R80, R80, 0x1c; +--:-:-:-:1 IADD R86, R86, 0x1c; +--:-:-:-:1 IADD R87, R87, 0x1c; +--:-:-:-:1 IADD R88, R88, 0x1c; +02:-:-:-:1 IADD R76, R76, R85.reuse; +--:-:-:-:1 IADD R75, R75, R85.reuse; +--:-:-:-:1 IADD R78, R78, R85.reuse; +--:-:-:-:1 IADD R79, R79, R85; +--:-:-:-:1 FMUL R64, R19, R72.reuse; +--:-:-:-:1 FMUL R65, R23, R72.reuse; +--:-:-:-:1 FMUL R66, R17, R72.reuse; +--:-:-:-:1 FMUL R67, R21, R72.reuse; +--:-:-:-:1 FMUL R68, R51, R72.reuse; +--:-:-:-:1 FMUL R69, R55, R72.reuse; +--:-:-:-:1 FMUL 
R70, R49, R72.reuse; +--:-:-:-:0 FMUL R71, R53, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R18, R72.reuse; +--:-:-:-:1 FMUL R65, R22, R72.reuse; +--:-:-:-:1 FMUL R66, R16, R72.reuse; +--:-:-:-:1 FMUL R67, R20, R72.reuse; +--:-:-:-:1 FMUL R68, R50, R72.reuse; +--:-:-:-:1 FMUL R69, R54, R72.reuse; +--:-:-:-:1 FMUL R70, R48, R72.reuse; +--:-:-:-:0 FMUL R71, R52, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R27, R72.reuse; +--:-:-:-:1 FMUL R65, R31, R72.reuse; +--:-:-:-:1 FMUL R66, R25, R72.reuse; +--:-:-:-:1 FMUL R67, R29, R72.reuse; +--:-:-:-:1 FMUL R68, R59, R72.reuse; +--:-:-:-:1 FMUL R69, R63, R72.reuse; +--:-:-:-:1 FMUL R70, R57, R72.reuse; +--:-:-:-:0 FMUL R71, R61, R72; +--:-:-:-:5 CAL TARGET2; +02:-:-:-:1 FMUL R64, R26, R72.reuse; +--:-:-:-:1 FMUL R65, R30, R72.reuse; +--:-:-:-:1 FMUL R66, R24, R72.reuse; +--:-:-:-:1 FMUL R67, R28, R72.reuse; +--:-:-:-:1 FMUL R68, R58, R72.reuse; +--:-:-:-:1 FMUL R69, R62, R72.reuse; +--:-:-:-:1 FMUL R70, R56, R72.reuse; +--:-:-:-:0 FMUL R71, R60, R72; +--:-:-:-:5 CAL TARGET2; +--:-:-:-:5 EXIT; +TARGET2: +--:-:-:-:0 IADD R80, R80, 0x1; +--:-:-:-:1 STS.128 [R93], R64; +--:-:-:-:0 IADD R86, R86, 0x1; +--:-:-:-:1 STS.128 [R93+0x80], R68; +--:-:-:-:0 IADD R87, R87, 0x1; +--:-:-:-:1 LDS R64, [R92]; +--:-:-:-:0 IADD R88, R88, 0x1; +--:-:-:-:1 LDS R65, [R92+0x80]; +--:-:-:-:0 IADD R76, R76, R77.reuse; +--:-:-:-:1 LDS R66, [R92+0x100]; +--:-:-:-:0 IADD R75, R75, R77.reuse; +--:-:-:-:1 LDS R67, [R92+0x180]; +--:-:-:-:0 IADD R78, R78, R77.reuse; +--:-:-:-:1 LDS R68, [R92+0x200]; +--:-:-:-:0 IADD R79, R79, R77; +--:-:-:-:1 LDS R69, [R92+0x280]; +--:-:-:-:1 ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; +--:-:-:-:1 LDS R70, [R92+0x300]; +--:-:-:-:1 ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6; +--:-:1:-:1 LDS R71, [R92+0x380]; +--:-:-:-:2 ISETP.LT.AND P2, PT, R86.reuse, c[0x0][0x148], P5; +--:-:-:Y:7 ISETP.LT.AND P3, PT, R86, c[0x0][0x148], P6; +01:-:-:-:1 @P0 STG.CG [R76], R64; +--:-:-:-:1 ISETP.LT.AND P0, PT, 
R87.reuse, c[0x0][0x148], P5; +--:-:-:-:1 @P1 STG.CG [R76+0x80], R65; +--:-:-:-:1 ISETP.LT.AND P1, PT, R87, c[0x0][0x148], P6; +--:-:-:-:1 @P2 STG.CG [R75], R66; +--:-:-:-:1 ISETP.LT.AND P2, PT, R88.reuse, c[0x0][0x148], P5; +--:-:-:-:1 @P3 STG.CG [R75+0x80], R67; +--:-:-:Y:7 ISETP.LT.AND P3, PT, R88, c[0x0][0x148], P6; +--:-:-:-:2 @P0 STG.CG [R78], R68; +--:-:-:-:2 @P1 STG.CG [R78+0x80], R69; +--:-:-:-:2 @P2 STG.CG [R79], R70; +--:2:-:-:1 @P3 STG.CG [R79+0x80], R71; +--:-:-:-:5 RET; diff --git a/Assembler/PascalAs/sgemm/sgemm_pre_128.sass b/Assembler/PascalAs/sgemm/sgemm_pre_128.sass new file mode 100644 index 0000000..cde320e --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm_pre_128.sass @@ -0,0 +1,924 @@ +# Kernel: sgemm_kernel_128 +# +# SharedSize: 16384 +# Params(8): +# 0:0x140:4:4 param_C, +# 1:0x144:4:0 param_m, +# 2:0x148:4:0 param_n, +# 3:0x14c:4:0 param_k, +# 4:0x150:4:0 param_lda, +# 5:0x154:4:0 param_ldb, +# 6:0x158:4:0 param_ldc +# 7:0x15c:4:0 param_alpha +# 8:0x160:4:4 param_D // for diagnostic printf output +# +# Globals: +# c[0x0][0x164]: texA (the value is 1) +# c[0x0][0x168]: texB (the value is 0) + + + + // Temporary registers to calculate the state registers. Reuse the C output registers. + // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts. + 0-63 ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy + + // Aliases for the C registers we use for initializing C (used as vectors) + 0-63 : cz<00-63> + + // The offset we store our zero value for initializing C. Reuse a register from the second blocking registers + 80 : zOffset + + // 64 C matrix output registers. + // Use special mapping to avoid register bank conflicts between these registers and the blocking registers. 
+ 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> + 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> + 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> + 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> + 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> + + // Double buffered register blocking used in vector loads. + // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags + 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> + 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> + + // Registers to load A or B + 96-103 : loadX<0-7> + + // Key global state registers for main loop and some we reuse for outputting C. + // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of + // delayed bank conflicts between memory operations and ffmas. + // The array index bracket notation can be used to request a bank in a dynamically allocated range. + 104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs + + // Registers to store the results back to global memory. Reuse any register not needed after the main loop. + // Statically allocate cs0-7 because they're vector registers. + 64-71 : cs<0-7> + + // dynamically allocated C output registers(~) + 72-103 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX + + + +// Note the absence of the loading of the stack pointer into R1. +// No idea why ptxas does that anyway when it's not used for register spilling. +// Such a waste of a perfectly good register. 
+ +// Scheduler doesn't handle the dependency flags yet, +// so move these first instructions outside the block that's auto scheduled +//--:-:-:-:1 CS2R clock, SR_CLOCKLO; +//--:-:-:-:1 S2R smId, SR_VIRTID; +//--:-:-:-:1 S2R nSMs, SR_VIRTCFG; +--:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 +--:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 +--:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 + +// Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies +// Memory dependencies are left up to the author to deal with manually for now. +01:-:-:Y:1 ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 BFE.U32 tid4, tid, 0x205; // 2 bits at position 5 +--:-:-:-:1 MOV k, c[0x0][0x14c]; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 LOP.AND readBs, tid, 0x70; +--:-:-:-:1 SHL tid31_4, tid31, 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 IADD k, k, -8; +--:-:-:-:1 LOP.AND zOffset, tid, -32; +--:-:-:-:1 SHR.U32 readAs, tid128, 4; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:0 @!P0 MOV ldx4, c[0x0][0x150]; +--:-:-:-:1 STS.128 [zOffset + 4x<16*128>], RZ; +--:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; +--:-:-:-:1 ISCADD writeS, tid4, tid31_4, 9; +06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 +--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA +--:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB +--:-:-:-:1 LOP.OR readAs, readAs, tid7; +--:-:-:-:1 SHR.U32 ldx, ldx4, 2; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD track0, blk, tid31, 5; +--:-:-:-:1 IADD ldx8, ldx4, ldx4; +--:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*128>; +--:-:-:-:1 SHL readAs, readAs, 4; +--:-:-:-:1 XMAD.MRG xmad_t0, ldx, tid4.H1, RZ; // XMAD.LO is a macro that is expanded out into the 3 XMADs +--:-:-:-:1 ISCADD readBs, readBs, 4x<8*128>, 4; +--:-:-:-:1 XMAD track0, ldx, tid4, track0; +--:-:-:Y:5 XMAD.MRG xmad_end, k, ldx.H1, 
RZ; +--:-:-:-:2 XMAD.PSL.CBCC track0, ldx.H1, xmad_t0.H1, track0; +--:-:1:-:4 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 +--:-:-:-:1 IADD track4, track0, ldx4; +--:-:-:-:1 XMAD end, k, ldx, track0; +--:-:2:Y:5 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:-:-:1 XMAD.PSL.CBCC end, k.H1, xmad_end.H1, end; + +// Initialize C registers to zero +// Using LDS.U.128 is a neat trick to save a few clock cycles +// (when you have enough warps to hide the latency.) +--:-:3:-:1 LDS.U.128 cz00, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz04, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz08, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz12, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz16, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz20, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz24, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz28, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz32, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz36, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz40, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz44, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz48, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz52, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz56, [zOffset + 4x<16*128>]; +--:-:3:-:1 LDS.U.128 cz60, [zOffset + 4x<16*128>]; + +// These instructions need to occur after the textures load so put them in a new block +// that starts with a dependency barrier wait. +01:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1 +--:-:-:-:0 IADD track0, track0, ldx8; +02:-:-:-:1 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2 +--:-:-:-:0 IADD track4, track4, ldx8; +04:-:-:-:5 BAR.SYNC 0; + +// The next store to shared goes to high area. +// Having 2 share buffers allows us to eliminate a bar.sync in the main loop. +// This way we don't have to wait for all threads to arrive before writing fresh data to shared. 
+// Other threads can continue reading from the last batch while the new data is being written. +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*128>; + +// Preload the first lines of A and B from shared +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1 + + +// The main loop +// While calculating the first line, load in the next line from shared. +// Shared memory stores enough to do this 8 times per loop. +// Also pull in the next block of memory from global and store it to shared. + +// Efficiency: +// ffma: 512 +// lds: 32 dual issued +// sts: 2 dual issued +// tex: 2 dual issued +// add: 2 +// xor: 3 +// setp: 1 +// bar: 1 dual issued +// bra: 1 dual issued +// Total: 524 (512/518 = 98.8% FFMA) + +// Memory Throughput Upper Bound: +// 2 * 4 * 4 bytes per thread per 518 clocks +// 128 threads per SM +// 16 SM's (GM204) +// 1640Mhz (boost overclock) +// .931 GiB/GB (1000^3 / 1024^3) +// 193 GiB/sec +// Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz) + +LOOP: + +// Loop end condition +--:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; + +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<1*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<1*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax64, [readAs + 4x<1*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By64, [readBs + 4x<1*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j0Ax66, j0By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j0Ax66, 
j0By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j0Ax64, j0By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j0Ax64, j0By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j0Ax67, j0By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j0Ax67, j0By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j0Ax65, j0By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j0Ax65, j0By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j0Ax67, j0By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j0Ax67, j0By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j0Ax65, j0By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j0Ax65, j0By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j0Ax66, j0By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j0Ax66, j0By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j0Ax64, j0By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j0Ax64, j0By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:0 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:2:-:1 @P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:-:Y:1 FFMA cx02y64, j0Ax02, j0By64, cx02y64; +--:-:-:-:0 FFMA cx02y65, j0Ax02, j0By65, cx02y65; +--:-:3:-:1 @P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3 +--:-:-:-:1 FFMA cx00y65, j0Ax00, j0By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j0Ax00, j0By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j0Ax03, j0By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j0Ax03, j0By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j0Ax01, j0By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j0Ax01, j0By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j0Ax66, j0By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j0Ax66, j0By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j0Ax64, j0By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j0Ax64, j0By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j0Ax67, j0By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j0Ax67, j0By65, 
cx67y65; +--:-:-:-:1 FFMA cx65y65, j0Ax65, j0By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j0Ax65, j0By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j0Ax67, j0By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j0Ax67, j0By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j0Ax65, j0By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j0Ax65, j0By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j0Ax66, j0By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j0Ax66, j0By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j0Ax64, j0By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j0Ax64, j0By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j0Ax03, j0By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j0Ax03, j0By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j0Ax01, j0By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j0Ax01, j0By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j0Ax02, j0By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j0Ax02, j0By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j0Ax00, j0By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j0Ax00, j0By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<2*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<2*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<2*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<2*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j1Ax66, j1By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j1Ax66, j1By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j1Ax64, j1By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j1Ax64, j1By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j1Ax67, j1By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j1Ax67, j1By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j1Ax65, j1By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j1Ax65, j1By00, cx65y00; 
+--:-:-:-:1 FFMA cx67y02, j1Ax67, j1By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j1Ax67, j1By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j1Ax65, j1By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j1Ax65, j1By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j1Ax66, j1By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j1Ax66, j1By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j1Ax64, j1By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j1Ax64, j1By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j1Ax02, j1By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j1Ax02, j1By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j1Ax00, j1By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j1Ax00, j1By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j1Ax03, j1By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j1Ax03, j1By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j1Ax01, j1By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j1Ax01, j1By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j1Ax66, j1By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j1Ax66, j1By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j1Ax64, j1By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j1Ax64, j1By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j1Ax67, j1By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j1Ax67, j1By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j1Ax65, j1By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j1Ax65, j1By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j1Ax67, j1By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j1Ax67, j1By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j1Ax65, j1By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j1Ax65, j1By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j1Ax66, j1By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j1Ax66, j1By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j1Ax64, j1By67, cx64y67; 
+--:-:-:-:1 FFMA cx64y66, j1Ax64, j1By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j1Ax03, j1By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j1Ax03, j1By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j1Ax01, j1By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j1Ax01, j1By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j1Ax02, j1By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j1Ax02, j1By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j1Ax00, j1By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j1Ax00, j1By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<3*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<3*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax64, [readAs + 4x<3*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By64, [readBs + 4x<3*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j0Ax66, j0By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j0Ax66, j0By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j0Ax64, j0By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j0Ax64, j0By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j0Ax67, j0By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j0Ax67, j0By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j0Ax65, j0By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j0Ax65, j0By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j0Ax67, j0By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j0Ax67, j0By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j0Ax65, j0By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j0Ax65, j0By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j0Ax66, j0By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j0Ax66, j0By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j0Ax64, j0By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j0Ax64, j0By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; 
+--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j0Ax02, j0By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j0Ax02, j0By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j0Ax00, j0By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j0Ax00, j0By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j0Ax03, j0By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j0Ax03, j0By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j0Ax01, j0By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j0Ax01, j0By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j0Ax66, j0By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j0Ax66, j0By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j0Ax64, j0By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j0Ax64, j0By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j0Ax67, j0By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j0Ax67, j0By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j0Ax65, j0By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j0Ax65, j0By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j0Ax67, j0By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j0Ax67, j0By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j0Ax65, j0By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j0Ax65, j0By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j0Ax66, j0By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j0Ax66, j0By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j0Ax64, j0By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j0Ax64, j0By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j0Ax03, j0By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j0Ax03, j0By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j0Ax01, j0By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j0Ax01, j0By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j0Ax02, j0By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j0Ax02, j0By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j0Ax00, j0By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j0Ax00, j0By66, cx00y66; 
+01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<4*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<4*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<4*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<4*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j1Ax66, j1By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j1Ax66, j1By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j1Ax64, j1By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j1Ax64, j1By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j1Ax67, j1By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j1Ax67, j1By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j1Ax65, j1By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j1Ax65, j1By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j1Ax67, j1By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j1Ax67, j1By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j1Ax65, j1By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j1Ax65, j1By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j1Ax66, j1By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j1Ax66, j1By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j1Ax64, j1By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j1Ax64, j1By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j1Ax02, j1By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j1Ax02, j1By65, cx02y65; 
+--:-:-:-:1 FFMA cx00y65, j1Ax00, j1By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j1Ax00, j1By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j1Ax03, j1By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j1Ax03, j1By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j1Ax01, j1By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j1Ax01, j1By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j1Ax66, j1By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j1Ax66, j1By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j1Ax64, j1By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j1Ax64, j1By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j1Ax67, j1By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j1Ax67, j1By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j1Ax65, j1By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j1Ax65, j1By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j1Ax67, j1By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j1Ax67, j1By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j1Ax65, j1By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j1Ax65, j1By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j1Ax66, j1By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j1Ax66, j1By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j1Ax64, j1By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j1Ax64, j1By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j1Ax03, j1By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j1Ax03, j1By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j1Ax01, j1By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j1Ax01, j1By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j1Ax02, j1By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j1Ax02, j1By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j1Ax00, j1By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j1Ax00, j1By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<5*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<5*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax64, [readAs + 4x<5*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, 
j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By64, [readBs + 4x<5*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j0Ax66, j0By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j0Ax66, j0By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j0Ax64, j0By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j0Ax64, j0By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j0Ax67, j0By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j0Ax67, j0By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j0Ax65, j0By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j0Ax65, j0By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j0Ax67, j0By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j0Ax67, j0By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j0Ax65, j0By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j0Ax65, j0By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j0Ax66, j0By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j0Ax66, j0By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j0Ax64, j0By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j0Ax64, j0By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j0Ax02, j0By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j0Ax02, j0By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j0Ax00, j0By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j0Ax00, j0By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j0Ax03, j0By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j0Ax03, j0By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j0Ax01, j0By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j0Ax01, j0By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j0Ax66, j0By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j0Ax66, j0By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j0Ax64, j0By65, cx64y65; +--:-:-:-:1 FFMA 
cx64y64, j0Ax64, j0By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j0Ax67, j0By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j0Ax67, j0By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j0Ax65, j0By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j0Ax65, j0By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j0Ax67, j0By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j0Ax67, j0By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j0Ax65, j0By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j0Ax65, j0By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j0Ax66, j0By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j0Ax66, j0By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j0Ax64, j0By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j0Ax64, j0By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j0Ax03, j0By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j0Ax03, j0By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j0Ax01, j0By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j0Ax01, j0By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j0Ax02, j0By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j0Ax02, j0By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j0Ax00, j0By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j0Ax00, j0By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<6*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<6*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<6*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<6*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j1Ax66, j1By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j1Ax66, j1By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j1Ax64, j1By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j1Ax64, j1By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j1Ax67, j1By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, 
j1Ax67, j1By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j1Ax65, j1By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j1Ax65, j1By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j1Ax67, j1By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j1Ax67, j1By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j1Ax65, j1By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j1Ax65, j1By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j1Ax66, j1By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j1Ax66, j1By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j1Ax64, j1By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j1Ax64, j1By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j1Ax02, j1By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j1Ax02, j1By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j1Ax00, j1By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j1Ax00, j1By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j1Ax03, j1By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j1Ax03, j1By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j1Ax01, j1By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j1Ax01, j1By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j1Ax66, j1By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j1Ax66, j1By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j1Ax64, j1By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j1Ax64, j1By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j1Ax67, j1By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j1Ax67, j1By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j1Ax65, j1By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j1Ax65, j1By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j1Ax67, j1By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j1Ax67, j1By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j1Ax65, j1By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j1Ax65, j1By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j1Ax66, 
j1By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j1Ax66, j1By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j1Ax64, j1By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j1Ax64, j1By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j1Ax03, j1By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j1Ax03, j1By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j1Ax01, j1By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j1Ax01, j1By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j1Ax02, j1By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j1Ax02, j1By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j1Ax00, j1By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j1Ax00, j1By66, cx00y66; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<7*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<7*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax64, [readAs + 4x<7*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By64, [readBs + 4x<7*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j0Ax66, j0By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j0Ax66, j0By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j0Ax64, j0By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j0Ax64, j0By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j0Ax67, j0By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j0Ax67, j0By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j0Ax65, j0By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j0Ax65, j0By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j0Ax67, j0By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j0Ax67, j0By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j0Ax65, j0By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j0Ax65, j0By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j0Ax66, j0By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j0Ax66, j0By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j0Ax64, j0By03, 
cx64y03; +--:-:-:-:1 FFMA cx64y02, j0Ax64, j0By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:0 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +02:-:-:-:1 @P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2 +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j0Ax02, j0By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j0Ax02, j0By65, cx02y65; +--:-:-:-:0 FFMA cx00y65, j0Ax00, j0By65, cx00y65; +04:-:-:-:1 @P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3 +--:-:-:-:1 FFMA cx00y64, j0Ax00, j0By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j0Ax03, j0By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j0Ax03, j0By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j0Ax01, j0By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j0Ax01, j0By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j0Ax66, j0By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j0Ax66, j0By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j0Ax64, j0By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j0Ax64, j0By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j0Ax67, j0By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j0Ax67, j0By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j0Ax65, j0By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j0Ax65, j0By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j0Ax67, j0By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j0Ax67, j0By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j0Ax65, j0By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j0Ax65, j0By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j0Ax66, j0By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j0Ax66, j0By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j0Ax64, j0By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j0Ax64, j0By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j0Ax03, j0By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j0Ax03, j0By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j0Ax01, j0By67, cx01y67; +--:-:-:-:1 FFMA 
cx01y66, j0Ax01, j0By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j0Ax02, j0By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j0Ax02, j0By67, cx02y67; +--:-:-:-:0 FFMA cx00y67, j0Ax00, j0By67, cx00y67; +01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1 +--:-:-:-:1 @P0 LOP.XOR readAs, readAs, 4x<16*128>; +--:-:-:-:1 @P0 LOP.XOR readBs, readBs, 4x<16*128>; +--:-:-:-:1 @P0 LOP.XOR writeS, writeS, 4x<16*128>; +--:-:-:-:1 FFMA cx00y66, j0Ax00, j0By66, cx00y66; +--:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; +--:-:-:-:1 @P0 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 @P0 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 @P0 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 @P0 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx66y00, j1Ax66, j1By00, cx66y00; +--:-:-:-:1 FFMA cx66y01, j1Ax66, j1By01, cx66y01; +--:-:-:-:1 FFMA cx64y01, j1Ax64, j1By01, cx64y01; +--:-:-:-:1 FFMA cx64y00, j1Ax64, j1By00, cx64y00; +--:-:-:-:1 FFMA cx67y00, j1Ax67, j1By00, cx67y00; +--:-:-:-:1 FFMA cx67y01, j1Ax67, j1By01, cx67y01; +--:-:-:-:1 FFMA cx65y01, j1Ax65, j1By01, cx65y01; +--:-:-:-:1 FFMA cx65y00, j1Ax65, j1By00, cx65y00; +--:-:-:-:1 FFMA cx67y02, j1Ax67, j1By02, cx67y02; +--:-:-:-:1 FFMA cx67y03, j1Ax67, j1By03, cx67y03; +--:-:-:-:1 FFMA cx65y03, j1Ax65, j1By03, cx65y03; +--:-:-:-:1 FFMA cx65y02, j1Ax65, j1By02, cx65y02; +--:-:-:-:1 FFMA cx66y02, j1Ax66, j1By02, cx66y02; +--:-:-:-:1 FFMA cx66y03, j1Ax66, j1By03, cx66y03; +--:-:-:-:1 FFMA cx64y03, j1Ax64, j1By03, cx64y03; +--:-:-:-:1 FFMA cx64y02, j1Ax64, j1By02, cx64y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, 
j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y64, j1Ax02, j1By64, cx02y64; +--:-:-:-:1 FFMA cx02y65, j1Ax02, j1By65, cx02y65; +--:-:-:-:1 FFMA cx00y65, j1Ax00, j1By65, cx00y65; +--:-:-:-:1 FFMA cx00y64, j1Ax00, j1By64, cx00y64; +--:-:-:-:1 FFMA cx03y64, j1Ax03, j1By64, cx03y64; +--:-:-:-:1 FFMA cx03y65, j1Ax03, j1By65, cx03y65; +--:-:-:-:1 FFMA cx01y65, j1Ax01, j1By65, cx01y65; +--:-:-:-:1 FFMA cx01y64, j1Ax01, j1By64, cx01y64; +--:-:-:-:1 FFMA cx66y64, j1Ax66, j1By64, cx66y64; +--:-:-:-:1 FFMA cx66y65, j1Ax66, j1By65, cx66y65; +--:-:-:-:1 FFMA cx64y65, j1Ax64, j1By65, cx64y65; +--:-:-:-:1 FFMA cx64y64, j1Ax64, j1By64, cx64y64; +--:-:-:-:1 FFMA cx67y64, j1Ax67, j1By64, cx67y64; +--:-:-:-:1 FFMA cx67y65, j1Ax67, j1By65, cx67y65; +--:-:-:-:1 FFMA cx65y65, j1Ax65, j1By65, cx65y65; +--:-:-:-:1 FFMA cx65y64, j1Ax65, j1By64, cx65y64; +--:-:-:-:1 FFMA cx67y66, j1Ax67, j1By66, cx67y66; +--:-:-:-:1 FFMA cx67y67, j1Ax67, j1By67, cx67y67; +--:-:-:-:1 FFMA cx65y67, j1Ax65, j1By67, cx65y67; +--:-:-:-:1 FFMA cx65y66, j1Ax65, j1By66, cx65y66; +--:-:-:-:1 FFMA cx66y66, j1Ax66, j1By66, cx66y66; +--:-:-:-:1 FFMA cx66y67, j1Ax66, j1By67, cx66y67; +--:-:-:-:1 FFMA cx64y67, j1Ax64, j1By67, cx64y67; +--:-:-:-:1 FFMA cx64y66, j1Ax64, j1By66, cx64y66; +--:-:-:-:1 FFMA cx03y66, j1Ax03, j1By66, cx03y66; +--:-:-:-:1 FFMA cx03y67, j1Ax03, j1By67, cx03y67; +--:-:-:-:1 FFMA cx01y67, j1Ax01, j1By67, cx01y67; +--:-:-:-:1 FFMA cx01y66, j1Ax01, j1By66, cx01y66; +--:-:-:-:1 FFMA cx02y66, j1Ax02, j1By66, cx02y66; +--:-:-:-:1 FFMA cx02y67, j1Ax02, j1By67, cx02y67; +--:-:-:-:1 FFMA cx00y67, j1Ax00, j1By67, cx00y67; +--:-:-:-:1 FFMA cx00y66, j1Ax00, j1By66, cx00y66; +--:-:-:-:1 @P0 IADD track0, track0, ldx8; 
+--:-:-:-:0 @P0 IADD track4, track4, ldx8; +--:-:-:Y:5 @P0 BRA LOOP; + +// Main loop is done, time to write C to global memory. +--:-:-:-:1 SHR.U32 cx, tid128, 2; +--:-:-:-:1 MOV ldc, c[0x0][0x158]; +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 MOV alpha, c[0x0][0x15c]; +--:-:-:-:1 SHL readCs, tid96, 4; +--:-:-:-:1 LOP.AND readAs, readAs, 0xfff; +--:-:-:-:1 LOP.OR cx, tid31, cx; +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 LOP.AND readBs, readBs, 0xfff; +--:-:-:-:1 ISCADD cy00, by, cy00, 7; +--:-:-:-:1 FMUL cs0, cx00y00, alpha; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 LOP.OR readCs, readCs, cx; +--:-:-:-:1 ISCADD cx, bx, cx, 7; +--:-:-:-:1 FMUL cs1, cx01y00, alpha; +--:-:-:-:1 SHL ldc8, ldc, 5; +--:-:-:-:1 XMAD.MRG xmad_ci, cy00, ldc.H1, RZ; +--:-:-:-:1 ISCADD writeCs, readBs, readAs, 5; +--:-:-:-:1 FMUL cs2, cx02y00, alpha; +--:-:-:-:1 SHL readCs, readCs, 2; +--:-:-:-:1 XMAD ci, cy00, ldc, cx; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m +--:-:-:-:1 IADD cx, cx, 64; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; +--:-:-:-:1 FMUL cs3, cx03y00, alpha; +--:-:-:-:1 FMUL cs4, cx64y00, alpha; +--:-:-:-:1 XMAD.PSL.CBCC ci, cy00.H1, xmad_ci.H1, ci; +--:-:-:-:1 IADD cy00, cy00, -1; +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m +--:-:-:-:1 FMUL cs5, cx65y00, alpha; +--:-:-:-:1 FMUL cs6, cx66y00, alpha; +--:-:-:-:1 FMUL cs7, cx67y00, alpha; +--:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:3 IADD cy12, cy00, 12; +--:-:-:Y:6 IADD Cy00, Cy00, -ldc1; +--:-:-:-:1 IADD Cy04, Cy00, ldc4; +--:-:-:Y:5 IADD Cy08, Cy00, ldc8; +--:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) + +// There's nothing yet in place to handle dependencies with subroutines. +// So don't schedule this block.
+--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y01, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y01, alpha; +--:-:-:-:1 FMUL cs2, cx02y01, alpha; +--:-:-:-:1 FMUL cs3, cx03y01, alpha; +--:-:-:-:1 FMUL cs4, cx64y01, alpha; +--:-:-:-:1 FMUL cs5, cx65y01, alpha; +--:-:-:-:1 FMUL cs6, cx66y01, alpha; +--:-:-:-:0 FMUL cs7, cx67y01, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y02, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y02, alpha; +--:-:-:-:1 FMUL cs2, cx02y02, alpha; +--:-:-:-:1 FMUL cs3, cx03y02, alpha; +--:-:-:-:1 FMUL cs4, cx64y02, alpha; +--:-:-:-:1 FMUL cs5, cx65y02, alpha; +--:-:-:-:1 FMUL cs6, cx66y02, alpha; +--:-:-:-:0 FMUL cs7, cx67y02, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y03, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y03, alpha; +--:-:-:-:1 FMUL cs2, cx02y03, alpha; +--:-:-:-:1 FMUL cs3, cx03y03, alpha; +--:-:-:-:1 FMUL cs4, cx64y03, alpha; +--:-:-:-:1 FMUL cs5, cx65y03, alpha; +--:-:-:-:1 FMUL cs6, cx66y03, alpha; +--:-:-:-:0 FMUL cs7, cx67y03, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 IADD cy00, cy00, 60; +--:-:-:-:1 IADD cy04, cy04, 60; +--:-:-:-:1 IADD cy08, cy08, 60; +--:-:-:-:1 IADD cy12, cy12, 60; + +02:-:-:-:1 IADD Cy00, Cy00, ldc60; // Wait Dep 2 +--:-:-:-:1 IADD Cy04, Cy04, ldc60; +--:-:-:-:1 IADD Cy08, Cy08, ldc60; +--:-:-:-:1 IADD Cy12, Cy12, ldc60; + +--:-:-:-:1 FMUL cs0, cx00y64, alpha; +--:-:-:-:1 FMUL cs1, cx01y64, alpha; +--:-:-:-:1 FMUL cs2, cx02y64, alpha; +--:-:-:-:1 FMUL cs3, cx03y64, alpha; +--:-:-:-:1 FMUL cs4, cx64y64, alpha; +--:-:-:-:1 FMUL cs5, cx65y64, alpha; +--:-:-:-:1 FMUL cs6, cx66y64, alpha; +--:-:-:-:0 FMUL cs7, cx67y64, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y65, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y65, alpha; +--:-:-:-:1 FMUL cs2, cx02y65, alpha; +--:-:-:-:1 FMUL cs3, cx03y65, alpha; +--:-:-:-:1 FMUL cs4, cx64y65, alpha; +--:-:-:-:1 FMUL cs5, cx65y65, alpha; +--:-:-:-:1 
FMUL cs6, cx66y65, alpha; +--:-:-:-:0 FMUL cs7, cx67y65, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y66, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y66, alpha; +--:-:-:-:1 FMUL cs2, cx02y66, alpha; +--:-:-:-:1 FMUL cs3, cx03y66, alpha; +--:-:-:-:1 FMUL cs4, cx64y66, alpha; +--:-:-:-:1 FMUL cs5, cx65y66, alpha; +--:-:-:-:1 FMUL cs6, cx66y66, alpha; +--:-:-:-:0 FMUL cs7, cx67y66, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y67, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y67, alpha; +--:-:-:-:1 FMUL cs2, cx02y67, alpha; +--:-:-:-:1 FMUL cs3, cx03y67, alpha; +--:-:-:-:1 FMUL cs4, cx64y67, alpha; +--:-:-:-:1 FMUL cs5, cx65y67, alpha; +--:-:-:-:1 FMUL cs6, cx66y67, alpha; +--:-:-:-:0 FMUL cs7, cx67y67, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + + +// And we're done. The remainder is the STORE_C subroutine that's defined at the end of the kernel. +--:-:-:-:5 EXIT; + +// This routine does warp synchronous shuffling of our output data so as to be able +// to have coalesced writes to global memory. This is actually faster because the shared +// memory latencies can be hidden by other warps and we're only adding a few extra clocks +// to this thread. Global memory here is the bottleneck and being able to halve the needed +// bandwidth at the expense of a few clocks is a modest win. This also keeps power lower +// and our chip running faster. + +// Note, the SHFL instruction doesn't help us here because we're swapping different registers +// from different threads.
+STORE_C: + +--:-:-:-:0 IADD cy00, cy00, 1; +--:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; +--:-:-:-:0 IADD cy04, cy04, 1; +--:-:-:-:1 STS.128 [writeCs+4x<64>], cs4; +--:-:-:-:0 IADD cy08, cy08, 1; +--:-:-:-:1 LDS cs0, [readCs + 4x<0*128 + 00>]; +--:-:-:-:0 IADD cy12, cy12, 1; +--:-:-:-:1 LDS cs1, [readCs + 4x<0*128 + 64>]; +--:-:-:-:0 IADD Cy00, Cy00, ldc1; +--:-:-:-:1 LDS cs2, [readCs + 4x<1*128 + 00>]; +--:-:-:-:0 IADD Cy04, Cy04, ldc1; +--:-:-:-:1 LDS cs3, [readCs + 4x<1*128 + 64>]; +--:-:-:-:0 IADD Cy08, Cy08, ldc1; +--:-:-:-:1 LDS cs4, [readCs + 4x<2*128 + 00>]; +--:-:-:-:0 IADD Cy12, Cy12, ldc1; +--:-:-:-:1 LDS cs5, [readCs + 4x<2*128 + 64>]; +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m +--:-:-:-:1 LDS cs6, [readCs + 4x<3*128 + 00>]; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m +--:-:1:-:1 LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1 +--:-:-:-:2 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m +--:-:-:Y:7 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m +01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m +--:-:-:-:1 @P1 STG.CG [Cy00 + 4x<64>], cs1; // P1 still holds the cy00 test (it is recomputed on the next line) +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m +--:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; // P2 still holds the cy04 test (it is recomputed on the next line) +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m +--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<64>], cs3; // P3 still holds the cy04 test (it is recomputed on the next line) +--:-:-:Y:7 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m +--:-:-:-:2 @P0 STG.CG [Cy08 + 4x<00>], cs4; // P0 was recomputed above for cy08 +--:-:-:-:2 @P1 STG.CG [Cy08 + 4x<64>], cs5; // P1 was recomputed above for cy08 +--:-:-:-:2 @P2 STG.CG [Cy12 + 4x<00>], cs6; // P2 was recomputed above for cy12 +--:2:-:-:1 @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2 (P3 was recomputed above for cy12; callers wait on Dep 2 before rewriting cs0) + +--:-:-:-:5 RET; // Return to the instruction following the CAL STORE_C + diff --git a/Assembler/PascalAs/sgemm/sgemm_pre_64.sass b/Assembler/PascalAs/sgemm/sgemm_pre_64.sass new file mode 100644 index 0000000..aa2719e ---
/dev/null +++ b/Assembler/PascalAs/sgemm/sgemm_pre_64.sass @@ -0,0 +1,867 @@ +# Kernel: sgemm_kernel_64 +# +# SharedSize: 8192 +# Params(8): +# 0:0x140:4:4 param_C, +# 1:0x144:4:0 param_m, +# 2:0x148:4:0 param_n, +# 3:0x14c:4:0 param_k, +# 4:0x150:4:0 param_lda, +# 5:0x154:4:0 param_ldb, +# 6:0x158:4:0 param_ldc +# 7:0x15c:4:0 param_alpha +# 8:0x160:4:4 param_D // for diagnostic printf output +# +# Globals: +# c[0x0][0x164]: texA (the value is 1) +# c[0x0][0x168]: texB (the value is 0) + + + + 0-63 ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end + + 80 : zOffset + 0-63 : cz<00-63> + + 3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35> + 7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35> + 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35> + 5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35> + 35,34,43,42,51,50,59,58 : cx32y<00-03|32-35> + 39,38,47,46,55,54,63,62 : cx33y<00-03|32-35> + 33,32,41,40,49,48,57,56 : cx34y<00-03|32-35> + 37,36,45,44,53,52,61,60 : cx35y<00-03|32-35> + + 64-79 : j0Ax<00-03|32-35>, j0By<00-03|32-35> + 80-95 : j1Ax<00-03|32-35>, j1By<00-03|32-35> + + 64-71 : cs<0-7> + + 96-111 : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3> + + 112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32 + + 72-111 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX + + + +--:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 +--:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 +--:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 BFE.U32 tid2, tid, 0x104; // 1 bit at position 4 +--:-:-:-:1 MOV k, c[0x0][0x14c]; +--:-:-:-:1 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.AND readBs, tid, 0x30; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tid15_4, tid15, 4; +--:-:-:-:1 LOP.AND zOffset, tid, -32; +--:-:-:-:1 IADD k, k, -8; 
+--:-:-:-:1 SHL readAs, readAs, 4; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:0 @!P0 MOV ldx4, c[0x0][0x150]; +--:-:-:-:1 STS.128 [zOffset + 4x<16*64>], RZ; +--:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; +--:-:-:-:1 ISCADD writeS, tid2, tid15_4, 8; +06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 +--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA +--:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 SHR.U32 ldx, ldx4, 2; +--:-:-:-:1 LOP.AND tid32, tid, 32; +--:-:-:-:1 ISCADD track0, blk, tid15, 4; +--:-:-:-:1 IADD ldx8, ldx4, ldx4; +--:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*64>; +--:-:-:-:1 ISCADD readBs, readBs, 4x<8*64>, 4; +--:-:-:-:1 XMAD.MRG xmad_t0, ldx, tid2.H1, RZ; +--:-:-:-:1 XMAD.MRG xmad_end, k, ldx.H1, RZ; +--:-:-:Y:6 XMAD track0, ldx, tid2, track0; +--:-:-:-:2 XMAD.PSL.CBCC track0, ldx.H1, xmad_t0.H1, track0; +--:-:1:-:4 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 +--:-:-:-:1 IADD3 track2, track0, ldx, ldx; +--:-:-:-:1 IADD track4, track0, ldx4; +--:-:2:-:1 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:-:-:0 XMAD end, k, ldx, track0; +--:-:3:-:3 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3 +--:-:-:-:2 IADD track6, track2, ldx4; +--:-:4:-:1 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4 +--:-:-:-:1 XMAD.PSL.CBCC end, k.H1, xmad_end.H1, end; + +--:-:5:-:1 LDS.U.128 cz00, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz04, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz08, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz12, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz16, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz20, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz24, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz28, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz32, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz36, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz40, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128
cz44, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz48, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz52, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz56, [zOffset + 4x<16*64>]; +--:-:5:-:1 LDS.U.128 cz60, [zOffset + 4x<16*64>]; + +01:-:-:-:1 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1 +--:-:-:-:0 IADD track0, track0, ldx8; +02:-:-:-:1 STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2 +--:-:-:-:0 IADD track2, track2, ldx8; +04:-:-:-:1 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3 +--:-:-:-:0 IADD track4, track4, ldx8; +08:-:-:-:1 STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4 +--:-:-:-:0 IADD track6, track6, ldx8; +10:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*64>; + +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1 + +// Efficiency: +// ffma: 512 +// lds: 32 dual issued +// sts: 4 dual issued +// tex: 4 dual issued +// add: 4 +// xor: 3 +// setp: 1 +// bar: 1 dual issued +// bra: 1 dual issued +// Total: 520 (512/520 = 98.5% FFMA) + +LOOP: + +// Loop end condition +--:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; + +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<1*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<1*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax32, [readAs + 4x<1*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By32, [readBs + 4x<1*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j0Ax34, j0By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, 
j0Ax34, j0By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j0Ax32, j0By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j0Ax32, j0By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j0Ax35, j0By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j0Ax35, j0By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j0Ax33, j0By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j0Ax33, j0By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j0Ax35, j0By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j0Ax35, j0By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j0Ax33, j0By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j0Ax33, j0By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j0Ax34, j0By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j0Ax34, j0By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j0Ax32, j0By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j0Ax32, j0By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:0 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:-:1 @P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA cx02y32, j0Ax02, j0By32, cx02y32; +--:-:-:-:0 FFMA cx02y33, j0Ax02, j0By33, cx02y33; +--:-:2:-:1 @P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2 +--:-:-:-:1 FFMA cx00y33, j0Ax00, j0By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j0Ax00, j0By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j0Ax03, j0By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j0Ax03, j0By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j0Ax01, j0By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j0Ax01, j0By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j0Ax34, j0By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j0Ax34, j0By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j0Ax32, j0By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j0Ax32, j0By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j0Ax35, j0By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j0Ax35, j0By33, cx35y33; 
+--:-:-:-:1 FFMA cx33y33, j0Ax33, j0By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j0Ax33, j0By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j0Ax35, j0By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j0Ax35, j0By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j0Ax33, j0By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j0Ax33, j0By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j0Ax34, j0By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j0Ax34, j0By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j0Ax32, j0By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j0Ax32, j0By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j0Ax03, j0By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j0Ax03, j0By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j0Ax01, j0By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j0Ax01, j0By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j0Ax02, j0By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j0Ax02, j0By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j0Ax00, j0By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j0Ax00, j0By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<2*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<2*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<2*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<2*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j1Ax34, j1By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j1Ax34, j1By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j1Ax32, j1By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j1Ax32, j1By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j1Ax35, j1By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j1Ax35, j1By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j1Ax33, j1By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j1Ax33, j1By00, cx33y00; +--:-:-:-:1 
FFMA cx35y02, j1Ax35, j1By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j1Ax35, j1By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j1Ax33, j1By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j1Ax33, j1By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j1Ax34, j1By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j1Ax34, j1By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j1Ax32, j1By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j1Ax32, j1By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:0 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:-:1 @P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; +--:-:-:Y:1 FFMA cx02y32, j1Ax02, j1By32, cx02y32; +--:-:-:-:0 FFMA cx02y33, j1Ax02, j1By33, cx02y33; +--:-:3:-:1 @P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3 +--:-:-:-:1 FFMA cx00y33, j1Ax00, j1By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j1Ax00, j1By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j1Ax03, j1By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j1Ax03, j1By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j1Ax01, j1By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j1Ax01, j1By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j1Ax34, j1By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j1Ax34, j1By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j1Ax32, j1By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j1Ax32, j1By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j1Ax35, j1By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j1Ax35, j1By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j1Ax33, j1By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j1Ax33, j1By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j1Ax35, j1By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j1Ax35, j1By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j1Ax33, j1By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j1Ax33, j1By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j1Ax34, 
j1By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j1Ax34, j1By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j1Ax32, j1By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j1Ax32, j1By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j1Ax03, j1By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j1Ax03, j1By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j1Ax01, j1By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j1Ax01, j1By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j1Ax02, j1By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j1Ax02, j1By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j1Ax00, j1By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j1Ax00, j1By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<3*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<3*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax32, [readAs + 4x<3*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By32, [readBs + 4x<3*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j0Ax34, j0By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j0Ax34, j0By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j0Ax32, j0By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j0Ax32, j0By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j0Ax35, j0By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j0Ax35, j0By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j0Ax33, j0By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j0Ax33, j0By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j0Ax35, j0By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j0Ax35, j0By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j0Ax33, j0By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j0Ax33, j0By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j0Ax34, j0By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j0Ax34, j0By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j0Ax32, j0By03, 
cx32y03; +--:-:-:-:1 FFMA cx32y02, j0Ax32, j0By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j0Ax02, j0By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j0Ax02, j0By33, cx02y33; +--:-:-:-:1 FFMA cx00y33, j0Ax00, j0By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j0Ax00, j0By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j0Ax03, j0By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j0Ax03, j0By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j0Ax01, j0By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j0Ax01, j0By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j0Ax34, j0By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j0Ax34, j0By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j0Ax32, j0By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j0Ax32, j0By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j0Ax35, j0By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j0Ax35, j0By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j0Ax33, j0By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j0Ax33, j0By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j0Ax35, j0By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j0Ax35, j0By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j0Ax33, j0By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j0Ax33, j0By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j0Ax34, j0By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j0Ax34, j0By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j0Ax32, j0By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j0Ax32, j0By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j0Ax03, j0By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j0Ax03, j0By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j0Ax01, j0By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j0Ax01, j0By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j0Ax02, j0By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j0Ax02, j0By35, cx02y35; 
+--:-:-:-:1 FFMA cx00y35, j0Ax00, j0By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j0Ax00, j0By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<4*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<4*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<4*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<4*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j1Ax34, j1By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j1Ax34, j1By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j1Ax32, j1By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j1Ax32, j1By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j1Ax35, j1By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j1Ax35, j1By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j1Ax33, j1By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j1Ax33, j1By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j1Ax35, j1By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j1Ax35, j1By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j1Ax33, j1By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j1Ax33, j1By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j1Ax34, j1By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j1Ax34, j1By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j1Ax32, j1By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j1Ax32, j1By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 
FFMA cx02y32, j1Ax02, j1By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j1Ax02, j1By33, cx02y33; +--:-:-:-:1 FFMA cx00y33, j1Ax00, j1By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j1Ax00, j1By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j1Ax03, j1By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j1Ax03, j1By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j1Ax01, j1By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j1Ax01, j1By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j1Ax34, j1By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j1Ax34, j1By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j1Ax32, j1By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j1Ax32, j1By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j1Ax35, j1By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j1Ax35, j1By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j1Ax33, j1By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j1Ax33, j1By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j1Ax35, j1By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j1Ax35, j1By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j1Ax33, j1By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j1Ax33, j1By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j1Ax34, j1By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j1Ax34, j1By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j1Ax32, j1By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j1Ax32, j1By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j1Ax03, j1By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j1Ax03, j1By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j1Ax01, j1By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j1Ax01, j1By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j1Ax02, j1By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j1Ax02, j1By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j1Ax00, j1By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j1Ax00, j1By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<5*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<5*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, 
cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax32, [readAs + 4x<5*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By32, [readBs + 4x<5*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j0Ax34, j0By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j0Ax34, j0By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j0Ax32, j0By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j0Ax32, j0By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j0Ax35, j0By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j0Ax35, j0By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j0Ax33, j0By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j0Ax33, j0By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j0Ax35, j0By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j0Ax35, j0By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j0Ax33, j0By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j0Ax33, j0By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j0Ax34, j0By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j0Ax34, j0By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j0Ax32, j0By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j0Ax32, j0By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j0Ax02, j0By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j0Ax02, j0By33, cx02y33; +--:-:-:-:1 FFMA cx00y33, j0Ax00, j0By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j0Ax00, j0By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j0Ax03, j0By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j0Ax03, j0By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j0Ax01, j0By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j0Ax01, j0By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j0Ax34, j0By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, 
j0Ax34, j0By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j0Ax32, j0By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j0Ax32, j0By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j0Ax35, j0By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j0Ax35, j0By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j0Ax33, j0By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j0Ax33, j0By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j0Ax35, j0By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j0Ax35, j0By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j0Ax33, j0By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j0Ax33, j0By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j0Ax34, j0By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j0Ax34, j0By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j0Ax32, j0By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j0Ax32, j0By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j0Ax03, j0By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j0Ax03, j0By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j0Ax01, j0By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j0Ax01, j0By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j0Ax02, j0By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j0Ax02, j0By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j0Ax00, j0By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j0Ax00, j0By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<6*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<6*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<6*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<6*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j1Ax34, j1By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j1Ax34, j1By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j1Ax32, j1By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j1Ax32, 
j1By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j1Ax35, j1By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j1Ax35, j1By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j1Ax33, j1By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j1Ax33, j1By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j1Ax35, j1By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j1Ax35, j1By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j1Ax33, j1By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j1Ax33, j1By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j1Ax34, j1By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j1Ax34, j1By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j1Ax32, j1By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j1Ax32, j1By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:0 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +02:-:-:-:1 @P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2 +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j1Ax02, j1By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j1Ax02, j1By33, cx02y33; +--:-:-:-:0 FFMA cx00y33, j1Ax00, j1By33, cx00y33; +--:-:-:-:1 @P0 STS.128 [writeS + 4x<2*64>], loadX2; +--:-:-:-:1 FFMA cx00y32, j1Ax00, j1By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j1Ax03, j1By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j1Ax03, j1By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j1Ax01, j1By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j1Ax01, j1By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j1Ax34, j1By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j1Ax34, j1By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j1Ax32, j1By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j1Ax32, j1By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j1Ax35, j1By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j1Ax35, j1By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j1Ax33, j1By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j1Ax33, j1By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, 
j1Ax35, j1By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j1Ax35, j1By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j1Ax33, j1By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j1Ax33, j1By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j1Ax34, j1By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j1Ax34, j1By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j1Ax32, j1By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j1Ax32, j1By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j1Ax03, j1By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j1Ax03, j1By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j1Ax01, j1By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j1Ax01, j1By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j1Ax02, j1By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j1Ax02, j1By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j1Ax00, j1By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j1Ax00, j1By34, cx00y34; +01:-:-:-:0 FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1 +--:-:-:-:1 LDS.U.128 j1Ax00, [readAs + 4x<7*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j0Ax02, j0By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j0Ax00, j0By01, cx00y01; +--:-:-:-:1 LDS.U.128 j1By00, [readBs + 4x<7*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j0Ax00, j0By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j0Ax03, j0By00, cx03y00; +--:-:-:-:1 LDS.U.128 j1Ax32, [readAs + 4x<7*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j0Ax03, j0By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j0Ax01, j0By01, cx01y01; +--:-:1:-:1 LDS.U.128 j1By32, [readBs + 4x<7*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j0Ax01, j0By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j0Ax34, j0By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j0Ax34, j0By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j0Ax32, j0By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j0Ax32, j0By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j0Ax35, j0By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j0Ax35, j0By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j0Ax33, j0By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j0Ax33, j0By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j0Ax35, j0By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j0Ax35, j0By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j0Ax33, 
j0By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j0Ax33, j0By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j0Ax34, j0By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j0Ax34, j0By03, cx34y03; +--:-:-:-:1 FFMA cx32y03, j0Ax32, j0By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j0Ax32, j0By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j0Ax03, j0By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j0Ax03, j0By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j0Ax01, j0By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j0Ax01, j0By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j0Ax02, j0By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j0Ax02, j0By03, cx02y03; +--:-:-:-:0 FFMA cx00y03, j0Ax00, j0By03, cx00y03; +04:-:-:-:1 @P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3 +--:-:-:-:1 FFMA cx00y02, j0Ax00, j0By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j0Ax02, j0By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j0Ax02, j0By33, cx02y33; +--:-:-:-:0 FFMA cx00y33, j0Ax00, j0By33, cx00y33; +--:-:-:-:1 @P0 STS.128 [writeS + 4x<6*64>], loadX6; +--:-:-:-:1 FFMA cx00y32, j0Ax00, j0By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j0Ax03, j0By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j0Ax03, j0By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j0Ax01, j0By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j0Ax01, j0By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j0Ax34, j0By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j0Ax34, j0By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j0Ax32, j0By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j0Ax32, j0By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j0Ax35, j0By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j0Ax35, j0By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j0Ax33, j0By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j0Ax33, j0By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j0Ax35, j0By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j0Ax35, j0By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j0Ax33, j0By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j0Ax33, j0By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j0Ax34, j0By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j0Ax34, j0By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j0Ax32, j0By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, 
j0Ax32, j0By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j0Ax03, j0By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j0Ax03, j0By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j0Ax01, j0By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j0Ax01, j0By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j0Ax02, j0By34, cx02y34; +--:-:-:-:1 FFMA cx02y35, j0Ax02, j0By35, cx02y35; +--:-:-:-:0 FFMA cx00y35, j0Ax00, j0By35, cx00y35; +01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1 +--:-:-:-:1 @P0 LOP.XOR readAs, readAs, 4x<16*64>; +--:-:-:-:1 @P0 LOP.XOR readBs, readBs, 4x<16*64>; +--:-:-:-:1 @P0 LOP.XOR writeS, writeS, 4x<16*64>; +--:-:-:-:1 FFMA cx00y34, j0Ax00, j0By34, cx00y34; +--:-:-:-:0 FFMA cx02y00, j1Ax02, j1By00, cx02y00; +--:-:-:-:1 @P0 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>]; +--:-:-:-:1 FFMA cx02y01, j1Ax02, j1By01, cx02y01; +--:-:-:-:0 FFMA cx00y01, j1Ax00, j1By01, cx00y01; +--:-:-:-:1 @P0 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 FFMA cx00y00, j1Ax00, j1By00, cx00y00; +--:-:-:-:0 FFMA cx03y00, j1Ax03, j1By00, cx03y00; +--:-:-:-:1 @P0 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>]; +--:-:-:-:1 FFMA cx03y01, j1Ax03, j1By01, cx03y01; +--:-:-:-:0 FFMA cx01y01, j1Ax01, j1By01, cx01y01; +--:-:1:-:1 @P0 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1 +--:-:-:-:1 FFMA cx01y00, j1Ax01, j1By00, cx01y00; +--:-:-:-:1 FFMA cx34y00, j1Ax34, j1By00, cx34y00; +--:-:-:-:1 FFMA cx34y01, j1Ax34, j1By01, cx34y01; +--:-:-:-:1 FFMA cx32y01, j1Ax32, j1By01, cx32y01; +--:-:-:-:1 FFMA cx32y00, j1Ax32, j1By00, cx32y00; +--:-:-:-:1 FFMA cx35y00, j1Ax35, j1By00, cx35y00; +--:-:-:-:1 FFMA cx35y01, j1Ax35, j1By01, cx35y01; +--:-:-:-:1 FFMA cx33y01, j1Ax33, j1By01, cx33y01; +--:-:-:-:1 FFMA cx33y00, j1Ax33, j1By00, cx33y00; +--:-:-:-:1 FFMA cx35y02, j1Ax35, j1By02, cx35y02; +--:-:-:-:1 FFMA cx35y03, j1Ax35, j1By03, cx35y03; +--:-:-:-:1 FFMA cx33y03, j1Ax33, j1By03, cx33y03; +--:-:-:-:1 FFMA cx33y02, j1Ax33, j1By02, cx33y02; +--:-:-:-:1 FFMA cx34y02, j1Ax34, j1By02, cx34y02; +--:-:-:-:1 FFMA cx34y03, j1Ax34, j1By03, cx34y03; 
+--:-:-:-:1 FFMA cx32y03, j1Ax32, j1By03, cx32y03; +--:-:-:-:1 FFMA cx32y02, j1Ax32, j1By02, cx32y02; +--:-:-:-:1 FFMA cx03y02, j1Ax03, j1By02, cx03y02; +--:-:-:-:1 FFMA cx03y03, j1Ax03, j1By03, cx03y03; +--:-:-:-:1 FFMA cx01y03, j1Ax01, j1By03, cx01y03; +--:-:-:-:1 FFMA cx01y02, j1Ax01, j1By02, cx01y02; +--:-:-:-:1 FFMA cx02y02, j1Ax02, j1By02, cx02y02; +--:-:-:-:1 FFMA cx02y03, j1Ax02, j1By03, cx02y03; +--:-:-:-:1 FFMA cx00y03, j1Ax00, j1By03, cx00y03; +--:-:-:-:1 FFMA cx00y02, j1Ax00, j1By02, cx00y02; +--:-:-:Y:1 FFMA cx02y32, j1Ax02, j1By32, cx02y32; +--:-:-:-:1 FFMA cx02y33, j1Ax02, j1By33, cx02y33; +--:-:-:-:1 FFMA cx00y33, j1Ax00, j1By33, cx00y33; +--:-:-:-:1 FFMA cx00y32, j1Ax00, j1By32, cx00y32; +--:-:-:-:1 FFMA cx03y32, j1Ax03, j1By32, cx03y32; +--:-:-:-:1 FFMA cx03y33, j1Ax03, j1By33, cx03y33; +--:-:-:-:1 FFMA cx01y33, j1Ax01, j1By33, cx01y33; +--:-:-:-:1 FFMA cx01y32, j1Ax01, j1By32, cx01y32; +--:-:-:-:1 FFMA cx34y32, j1Ax34, j1By32, cx34y32; +--:-:-:-:1 FFMA cx34y33, j1Ax34, j1By33, cx34y33; +--:-:-:-:1 FFMA cx32y33, j1Ax32, j1By33, cx32y33; +--:-:-:-:1 FFMA cx32y32, j1Ax32, j1By32, cx32y32; +--:-:-:-:1 FFMA cx35y32, j1Ax35, j1By32, cx35y32; +--:-:-:-:1 FFMA cx35y33, j1Ax35, j1By33, cx35y33; +--:-:-:-:1 FFMA cx33y33, j1Ax33, j1By33, cx33y33; +--:-:-:-:1 FFMA cx33y32, j1Ax33, j1By32, cx33y32; +--:-:-:-:1 FFMA cx35y34, j1Ax35, j1By34, cx35y34; +--:-:-:-:1 FFMA cx35y35, j1Ax35, j1By35, cx35y35; +--:-:-:-:1 FFMA cx33y35, j1Ax33, j1By35, cx33y35; +--:-:-:-:1 FFMA cx33y34, j1Ax33, j1By34, cx33y34; +--:-:-:-:1 FFMA cx34y34, j1Ax34, j1By34, cx34y34; +--:-:-:-:1 FFMA cx34y35, j1Ax34, j1By35, cx34y35; +--:-:-:-:1 FFMA cx32y35, j1Ax32, j1By35, cx32y35; +--:-:-:-:1 FFMA cx32y34, j1Ax32, j1By34, cx32y34; +--:-:-:-:1 FFMA cx03y34, j1Ax03, j1By34, cx03y34; +--:-:-:-:1 FFMA cx03y35, j1Ax03, j1By35, cx03y35; +--:-:-:-:1 FFMA cx01y35, j1Ax01, j1By35, cx01y35; +--:-:-:-:1 FFMA cx01y34, j1Ax01, j1By34, cx01y34; +--:-:-:-:1 FFMA cx02y34, j1Ax02, j1By34, cx02y34; 
+--:-:-:-:1 FFMA cx02y35, j1Ax02, j1By35, cx02y35; +--:-:-:-:1 FFMA cx00y35, j1Ax00, j1By35, cx00y35; +--:-:-:-:1 FFMA cx00y34, j1Ax00, j1By34, cx00y34; +--:-:-:-:1 @P0 IADD track0, track0, ldx8; +--:-:-:-:1 @P0 IADD track2, track2, ldx8; +--:-:-:-:1 @P0 IADD track4, track4, ldx8; +--:-:-:-:0 @P0 IADD track6, track6, ldx8; +--:-:-:Y:5 @P0 BRA LOOP; + +--:-:-:-:1 SHR.U32 cy00, tid32, 1; +--:-:-:-:1 MOV ldc, c[0x0][0x158]; +--:-:-:-:1 ISCADD cx, bx, tid31, 6; +--:-:-:-:1 MOV alpha, c[0x0][0x15c]; +--:-:-:-:1 ISCADD readCs, tid32, tid31, 3; +--:-:-:-:1 LOP.AND readAs, readAs, 0x7ff; +--:-:-:-:1 ISCADD cy00, by, cy00, 6; +--:-:-:-:1 LOP.AND readBs, readBs, 0x7ff; +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 FMUL cs0, cx00y00, alpha; +--:-:-:-:1 SHL ldc8, ldc, 5; +--:-:-:-:1 XMAD.MRG xmad_ci, cy00, ldc.H1, RZ; +--:-:-:-:1 ISCADD writeCs, readBs, readAs, 4; +--:-:-:-:1 XMAD ci, cy00, ldc, cx; +--:-:-:-:1 SHL readCs, readCs, 2; +--:-:-:-:1 IADD cx, cx, 32; +--:-:-:-:1 ISCADD ldc28, ldc, -ldc4, 7; +--:-:-:-:1 FMUL cs1, cx01y00, alpha; +--:-:-:-:1 FMUL cs2, cx02y00, alpha; +--:-:-:-:1 XMAD.PSL.CBCC ci, cy00.H1, xmad_ci.H1, ci; +--:-:-:-:1 IADD cy00, cy00, -1; +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m +--:-:-:-:1 FMUL cs3, cx03y00, alpha; +--:-:-:-:1 FMUL cs4, cx32y00, alpha; +--:-:-:-:1 FMUL cs5, cx33y00, alpha; +--:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:1 FMUL cs6, cx34y00, alpha; +--:-:-:-:1 FMUL cs7, cx35y00, alpha; +--:-:-:Y:6 IADD Cy00, Cy00, -ldc1; +--:-:-:-:1 IADD Cy04, Cy00, ldc4; +--:-:-:Y:5 IADD Cy08, Cy00, ldc8; +--:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) + +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y01, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y01, alpha; +--:-:-:-:1 FMUL
cs2, cx02y01, alpha; +--:-:-:-:1 FMUL cs3, cx03y01, alpha; +--:-:-:-:1 FMUL cs4, cx32y01, alpha; +--:-:-:-:1 FMUL cs5, cx33y01, alpha; +--:-:-:-:1 FMUL cs6, cx34y01, alpha; +--:-:-:-:0 FMUL cs7, cx35y01, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y02, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y02, alpha; +--:-:-:-:1 FMUL cs2, cx02y02, alpha; +--:-:-:-:1 FMUL cs3, cx03y02, alpha; +--:-:-:-:1 FMUL cs4, cx32y02, alpha; +--:-:-:-:1 FMUL cs5, cx33y02, alpha; +--:-:-:-:1 FMUL cs6, cx34y02, alpha; +--:-:-:-:0 FMUL cs7, cx35y02, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y03, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y03, alpha; +--:-:-:-:1 FMUL cs2, cx02y03, alpha; +--:-:-:-:1 FMUL cs3, cx03y03, alpha; +--:-:-:-:1 FMUL cs4, cx32y03, alpha; +--:-:-:-:1 FMUL cs5, cx33y03, alpha; +--:-:-:-:1 FMUL cs6, cx34y03, alpha; +--:-:-:-:0 FMUL cs7, cx35y03, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 IADD cy00, cy00, 28; +--:-:-:-:1 IADD cy04, cy04, 28; +--:-:-:-:1 IADD cy08, cy08, 28; +--:-:-:-:1 IADD cy12, cy12, 28; + +02:-:-:-:1 IADD Cy00, Cy00, ldc28; // Wait Dep 2 +--:-:-:-:1 IADD Cy04, Cy04, ldc28; +--:-:-:-:1 IADD Cy08, Cy08, ldc28; +--:-:-:-:1 IADD Cy12, Cy12, ldc28; + +--:-:-:-:1 FMUL cs0, cx00y32, alpha; +--:-:-:-:1 FMUL cs1, cx01y32, alpha; +--:-:-:-:1 FMUL cs2, cx02y32, alpha; +--:-:-:-:1 FMUL cs3, cx03y32, alpha; +--:-:-:-:1 FMUL cs4, cx32y32, alpha; +--:-:-:-:1 FMUL cs5, cx33y32, alpha; +--:-:-:-:1 FMUL cs6, cx34y32, alpha; +--:-:-:-:0 FMUL cs7, cx35y32, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y33, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y33, alpha; +--:-:-:-:1 FMUL cs2, cx02y33, alpha; +--:-:-:-:1 FMUL cs3, cx03y33, alpha; +--:-:-:-:1 FMUL cs4, cx32y33, alpha; +--:-:-:-:1 FMUL cs5, cx33y33, alpha; +--:-:-:-:1 FMUL cs6, cx34y33, alpha; +--:-:-:-:0 FMUL cs7, cx35y33, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y34, 
alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y34, alpha; +--:-:-:-:1 FMUL cs2, cx02y34, alpha; +--:-:-:-:1 FMUL cs3, cx03y34, alpha; +--:-:-:-:1 FMUL cs4, cx32y34, alpha; +--:-:-:-:1 FMUL cs5, cx33y34, alpha; +--:-:-:-:1 FMUL cs6, cx34y34, alpha; +--:-:-:-:0 FMUL cs7, cx35y34, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + +02:-:-:-:1 FMUL cs0, cx00y35, alpha; // Wait Dep 2 +--:-:-:-:1 FMUL cs1, cx01y35, alpha; +--:-:-:-:1 FMUL cs2, cx02y35, alpha; +--:-:-:-:1 FMUL cs3, cx03y35, alpha; +--:-:-:-:1 FMUL cs4, cx32y35, alpha; +--:-:-:-:1 FMUL cs5, cx33y35, alpha; +--:-:-:-:1 FMUL cs6, cx34y35, alpha; +--:-:-:-:0 FMUL cs7, cx35y35, alpha; // Dual Issue +--:-:-:-:5 CAL STORE_C; + + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:0 IADD cy00, cy00, 1; +--:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; +--:-:-:-:0 IADD cy04, cy04, 1; +--:-:-:-:1 STS.128 [writeCs+4x<32>], cs4; +--:-:-:-:0 IADD cy08, cy08, 1; +--:-:-:-:1 LDS cs0, [readCs + 4x<0*64 + 00>]; +--:-:-:-:0 IADD cy12, cy12, 1; +--:-:-:-:1 LDS cs1, [readCs + 4x<0*64 + 32>]; +--:-:-:-:0 IADD Cy00, Cy00, ldc1; +--:-:-:-:1 LDS cs2, [readCs + 4x<1*64 + 00>]; +--:-:-:-:0 IADD Cy04, Cy04, ldc1; +--:-:-:-:1 LDS cs3, [readCs + 4x<1*64 + 32>]; +--:-:-:-:0 IADD Cy08, Cy08, ldc1; +--:-:-:-:1 LDS cs4, [readCs + 4x<2*64 + 00>]; +--:-:-:-:0 IADD Cy12, Cy12, ldc1; +--:-:-:-:1 LDS cs5, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m +--:-:-:-:1 LDS cs6, [readCs + 4x<3*64 + 00>]; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m +--:-:1:-:1 LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1 +--:-:-:-:2 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m +--:-:-:Y:7 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m +01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m +--:-:-:-:1 @P1 STG.CG [Cy00 + 4x<32>], cs1; 
+--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m +--:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m +--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<32>], cs3; +--:-:-:Y:7 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m +--:-:-:-:2 @P0 STG.CG [Cy08 + 4x<00>], cs4; +--:-:-:-:2 @P1 STG.CG [Cy08 + 4x<32>], cs5; +--:-:-:-:2 @P2 STG.CG [Cy12 + 4x<00>], cs6; +--:2:-:-:1 @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2 + +--:-:-:-:5 RET; + diff --git a/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin b/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin new file mode 100644 index 0000000..0c7825f Binary files /dev/null and b/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin differ diff --git a/Assembler/PascalAs/sgemm/sgemm_sm52_64_dump.sass b/Assembler/PascalAs/sgemm/sgemm_sm52_64_dump.sass new file mode 100644 index 0000000..552d95b --- /dev/null +++ b/Assembler/PascalAs/sgemm/sgemm_sm52_64_dump.sass @@ -0,0 +1,1100 @@ + + code for sm_52 + Function : sgemm_kernel_128 + .headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)" + /* 0x001ffc00e22007f6 */ + /*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */ + /*0010*/ S2R R8, SR_TID.X; /* 0xf0c8000002170008 */ + /*0018*/ SSY 0x90; /* 0xe290000007000000 */ + /* 0x001fc400ffa00fed */ + /*0028*/ ISETP.GT.AND P0, PT, R8, 0x7f, PT; /* 0x3669038007f70807 */ + /*0030*/ @!P0 BRA 0x60; /* 0xe24000000288000f */ + /*0038*/ MOV R0, c[0x0][0x170]; /* 0x4c98078005c70000 */ + /* 0x001ff400fe0007f5 */ + /*0048*/ MOV32I R3, 0x20000000; /* 0x010200000007f003 */ + /*0050*/ { LOP32I.OR R2, R0, 0x80000000; /* 0x0428000000070002 */ + /*0058*/ SYNC; } /* 0xf0f800000007000f */ + /* 0x001fc000fea007f1 */ + /*0068*/ MOV R0, c[0x0][0x174]; /* 0x4c98078005d70000 */ + /*0070*/ MOV32I R3, 0x20000000; /* 0x010200000007f003 */ + /*0078*/ { LOP32I.OR R2, R0, 0x80000000; /* 0x0428000000070002 */ + /*0088*/ SYNC; } /* 0x001fd0800e2007fd */ + /* 
0xf0f800000007000f */ + /*0090*/ TLD.B.LZ.NODEP.P R4, R8, R2, 0x0, 1D, 0xf; /* 0xdd3a000780270804 */ + /*0098*/ SHL R0, R8, 0x4; /* 0x3848000000470800 */ + /* 0x081fc403ffe041f2 */ + /*00a8*/ STS.128 [R0], R4; /* 0xef5e000000070004 */ + /*00b0*/ BAR.SYNC 0x0; /* 0xf0a81b8000070000 */ + /*00b8*/ IADD32I R0, -R8.reuse, 0xff; /* 0x1d0000000ff70800 */ + /* 0x001fc000fe8207f5 */ + /*00c8*/ SHL R2, R8.reuse, 0x2; /* 0x3848000000270802 */ + /*00d0*/ SHL R0, R0, 0x4; /* 0x3848000000470000 */ + /*00d8*/ { IADD R4.CC, R2, c[0x0][0x140]; /* 0x4c10800005070204 */ + /*00e8*/ LDS.U.32 R0, [R0]; } /* 0x001fc400fec00711 */ + /* 0xef4c100000070000 */ + /*00f0*/ SHR R2, R8, 0x1e; /* 0x3829000001e70802 */ + /*00f8*/ IADD.X R3, R2, c[0x0][0x144]; /* 0x4c10080005170203 */ + /* 0x001ffc011e2007ff */ + /*0108*/ MOV R2, R4; /* 0x5c98078000470002 */ + /*0110*/ STG.E [R2], R0; /* 0xeedc200000070200 */ + /*0118*/ EXIT; /* 0xe30000000007000f */ + /* 0x001f8000fc0007ff */ + /*0128*/ BRA 0x120; /* 0xe2400fffff07000f */ + /*0130*/ NOP; /* 0x50b0000000070f00 */ + /*0138*/ NOP; /* 0x50b0000000070f00 */ + ................................. 
+ + + Function : sgemm_kernel_64 + .headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)" + /* 0x001d4400e6200711 */ + /*0008*/ S2R R119, SR_TID.X; /* 0xf0c8000002170077 */ + /*0010*/ S2R R125, SR_CTAID.X; /* 0xf0c800000257007d */ + /*0018*/ S2R R122, SR_CTAID.Y; /* 0xf0c800000267007a */ + /* 0x081fc440fe220ff1 */ + /*0028*/ ISETP.GE.AND P0, PT, R119.reuse, 0x20, PT; /* 0x366d038002077707 */ + /*0030*/ LOP.AND R9, R119.reuse, 0xf; /* 0x3847000000f77709 */ + /*0038*/ BFE.U32 R4, R119.reuse, 0x104; /* 0x3800000010477704 */ + /* 0x081fc440fe2007f1 */ + /*0048*/ MOV R12, c[0x0][0x14c]; /* 0x4c9807800537000c */ + /*0050*/ BFE.U32 R114, R119.reuse, 0x301; /* 0x3800000030177772 */ + /*0058*/ LOP.AND R115, R119.reuse, 0x30; /* 0x3847000003077773 */ + /* 0x081fc400fe2207f1 */ + /*0068*/ LOP.AND R0, R119.reuse, 0x1; /* 0x3847000000177700 */ + /*0070*/ SHL R13, R9, 0x4; /* 0x384800000047090d */ + /*0078*/ LOP.AND R80, R119.reuse, -0x20; /* 0x3947007ffe077750 */ + /* 0x081fc400fe2007f1 */ + /*0088*/ IADD R12, R12, -0x8; /* 0x3910007fff870c0c */ + /*0090*/ SHL R114, R114, 0x4; /* 0x3848000000477272 */ + /*0098*/ LOP.AND R126, R119.reuse, 0x1f; /* 0x3847000001f7777e */ + /* 0x001fc400fe2007f0 */ + /*00a8*/ { SHR.U32 R115, R115, 0x3; /* 0x3828000000377373 */ + /*00b0*/ STS.128 [R80+0x1000], RZ; } /* 0xef5e0001000750ff */ + /*00b8*/ @!P0 MOV R2, c[0x0][0x150]; /* 0x4c98078005480002 */ + /* 0x00dfc400fe2007f1 */ + /*00c8*/ ISCADD R118, R4, R13, 0x8; /* 0x5c18040000d70476 */ + /*00d0*/ @P0 MOV R2, c[0x0][0x154]; /* 0x4c98078005500002 */ + /*00d8*/ SEL R8, R122, R125, P0; /* 0x5ca0000007d77a08 */ + /* 0x001fc400fe2007f1 */ + /*00e8*/ @!P0 MOV32I R113, 0x80000001; /* 0x010800000018f071 */ + /*00f0*/ @P0 MOV32I R113, 0x80000000; /* 0x010800000000f071 */ + /*00f8*/ LOP.OR R115, R115, R0; /* 0x5c47020000077373 */ + /* 0x001fc440fe2007f1 */ + /*0108*/ LOP.AND R123, R119, 0x20; /* 0x384700000207777b */ + /*0110*/ SHR.U32 R1, R2.reuse, 0x2; /* 0x3828000000270201 */ + /*0118*/ IADD R121, 
R2, R2; /* 0x5c10000000270279 */ + /* 0x001fc800fe2007f1 */ + /*0128*/ ISCADD R112, R8, R9, 0x4; /* 0x5c18020000970870 */ + /*0130*/ @P0 IADD R118, R118, 0x800; /* 0x3810000080007676 */ + /*0138*/ ISCADD R115, R115, 0x800, 0x4; /* 0x3818020080077373 */ + /* 0x081f98c0fe2607f1 */ + /*0148*/ XMAD.MRG R5, R1.reuse, R4.H1.reuse, RZ; /* 0x5b007fa800470105 */ + /*0150*/ XMAD.MRG R16, R12.reuse, R1.H1.reuse, RZ; /* 0x5b007fa800170c10 */ + /*0158*/ XMAD R112, R1.reuse, R4, R112; /* 0x5b00380000470170 */ + /* 0x181fc480e28007f2 */ + /*0168*/ XMAD.PSL.CBCC R112, R1.H1, R5.H1, R112; /* 0x5b30381800570170 */ + /*0170*/ TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf; /* 0xdd38000787177060 */ + /*0178*/ IADD3 R116, R112.reuse, R1.reuse, R1; /* 0x5cc0008000177074 */ + /* 0x081fc080e62407f1 */ + /*0188*/ IADD R120, R112, R2.reuse; /* 0x5c10000000277078 */ + /*0190*/ TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf; /* 0xdd38000787177464 */ + /*0198*/ { XMAD R117, R12.reuse, R1, R112; /* 0x5b00380000170c75 */ + /*01a8*/ TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf; } /* 0x101dc400fe440753 */ + /* 0xdd38000787177868 */ + /*01b0*/ IADD R124, R116, R2; /* 0x5c1000000027747c */ + /*01b8*/ TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf; /* 0xdd38000787177c6c */ + /* 0x001e4400f22007f1 */ + /*01c8*/ XMAD.PSL.CBCC R117, R12.H1, R16.H1, R117; /* 0x5b303a9801070c75 */ + /*01d0*/ LDS.U.128 R0, [R80+0x1000]; /* 0xef4e100100075000 */ + /*01d8*/ LDS.U.128 R4, [R80+0x1000]; /* 0xef4e100100075004 */ + /* 0x001e4400f2200791 */ + /*01e8*/ LDS.U.128 R8, [R80+0x1000]; /* 0xef4e100100075008 */ + /*01f0*/ LDS.U.128 R12, [R80+0x1000]; /* 0xef4e10010007500c */ + /*01f8*/ LDS.U.128 R16, [R80+0x1000]; /* 0xef4e100100075010 */ + /* 0x001e4400f2200791 */ + /*0208*/ LDS.U.128 R20, [R80+0x1000]; /* 0xef4e100100075014 */ + /*0210*/ LDS.U.128 R24, [R80+0x1000]; /* 0xef4e100100075018 */ + /*0218*/ LDS.U.128 R28, [R80+0x1000]; /* 0xef4e10010007501c */ + /* 0x001e4400f2200791 */ + /*0228*/ LDS.U.128 R32, [R80+0x1000]; /* 
0xef4e100100075020 */ + /*0230*/ LDS.U.128 R36, [R80+0x1000]; /* 0xef4e100100075024 */ + /*0238*/ LDS.U.128 R40, [R80+0x1000]; /* 0xef4e100100075028 */ + /* 0x001e4400f2200791 */ + /*0248*/ LDS.U.128 R44, [R80+0x1000]; /* 0xef4e10010007502c */ + /*0250*/ LDS.U.128 R48, [R80+0x1000]; /* 0xef4e100100075030 */ + /*0258*/ LDS.U.128 R52, [R80+0x1000]; /* 0xef4e100100075034 */ + /* 0x003fc400f2200791 */ + /*0268*/ LDS.U.128 R56, [R80+0x1000]; /* 0xef4e100100075038 */ + /*0270*/ LDS.U.128 R60, [R80+0x1000]; /* 0xef4e10010007503c */ + /*0278*/ STS.128 [R118], R96; /* 0xef5e000000077660 */ + /* 0x101fc002fe2407f0 */ + /*0288*/ { IADD R112, R112, R121.reuse; /* 0x5c10000007977070 */ + /*0290*/ STS.128 [R118+0x200], R100; } /* 0xef5e000020077664 */ + /*0298*/ { IADD R116, R116, R121.reuse; /* 0x5c10000007977474 */ + /*02a8*/ STS.128 [R118+0x400], R104; } /* 0x011fc480fe0027f1 */ + /* 0xef5e000040077668 */ + /*02b0*/ { IADD R120, R120, R121.reuse; /* 0x5c10000007977878 */ + /*02b8*/ STS.128 [R118+0x600], R108; } /* 0xef5e00006007766c */ + /* 0x001fc010fea007f0 */ + /*02c8*/ { IADD R124, R124, R121; /* 0x5c10000007977c7c */ + /*02d0*/ BAR.SYNC 0x0; } /* 0xf0a81b8000070000 */ + /*02d8*/ { LOP.XOR R118, R118, 0x1000; /* 0x3847040100077676 */ + /*02e8*/ LDS.U.128 R64, [R114]; } /* 0x001fc400fe2007f1 */ + /* 0xef4e100000077240 */ + /*02f0*/ LDS.U.128 R72, [R115]; /* 0xef4e100000077348 */ + /*02f8*/ LDS.U.128 R68, [R114+0x80]; /* 0xef4e100008077244 */ + /* 0x183fc000fe200711 */ + /*0308*/ LDS.U.128 R76, [R115+0x80]; /* 0xef4e10000807734c */ + /*0310*/ ISETP.LE.AND P0, PT, R112, R117, PT; /* 0x5b67038007577007 */ + /*0318*/ { FFMA R1, R66.reuse, R72.reuse, R1; /* 0x5980008004874201 */ + /*0328*/ LDS.U.128 R80, [R114+0x100]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100010077250 */ + /*0330*/ FFMA R0, R66, R73.reuse, R0; /* 0x5980000004974200 */ + /*0338*/ { FFMA R2, R64.reuse, R73.reuse, R2; /* 0x5980010004974002 */ + /*0348*/ LDS.U.128 R88, [R115+0x100]; } /* 0x181fc080fe2007f1 */ + /* 
0xef4e100010077358 */ + /*0350*/ FFMA R3, R64, R72.reuse, R3; /* 0x5980018004874003 */ + /*0358*/ { FFMA R5, R67.reuse, R72.reuse, R5; /* 0x5980028004874305 */ + /*0368*/ LDS.U.128 R84, [R114+0x180]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100018077254 */ + /*0370*/ FFMA R4, R67, R73.reuse, R4; /* 0x5980020004974304 */ + /*0378*/ { FFMA R6, R65.reuse, R73.reuse, R6; /* 0x5980030004974106 */ + /*0388*/ LDS.U.128 R92, [R115+0x180]; } /* 0x181fc480fe200711 */ + /* 0xef4e10001807735c */ + /*0390*/ FFMA R7, R65, R72.reuse, R7; /* 0x5980038004874107 */ + /*0398*/ FFMA R33, R70.reuse, R72.reuse, R33; /* 0x5980108004874621 */ + /* 0x101fc4c0fe2407f1 */ + /*03a8*/ FFMA R32, R70, R73.reuse, R32; /* 0x5980100004974620 */ + /*03b0*/ FFMA R34, R68.reuse, R73.reuse, R34; /* 0x5980110004974422 */ + /*03b8*/ FFMA R35, R68, R72.reuse, R35; /* 0x5980118004874423 */ + /* 0x081fc4c0fe2607f1 */ + /*03c8*/ FFMA R37, R71.reuse, R72.reuse, R37; /* 0x5980128004874725 */ + /*03d0*/ FFMA R36, R71.reuse, R73.reuse, R36; /* 0x5980120004974724 */ + /*03d8*/ FFMA R38, R69.reuse, R73, R38; /* 0x5980130004974526 */ + /* 0x101fc4c0fe2207f1 */ + /*03e8*/ FFMA R39, R69.reuse, R72, R39; /* 0x5980138004874527 */ + /*03f0*/ FFMA R45, R71.reuse, R74.reuse, R45; /* 0x5980168004a7472d */ + /*03f8*/ FFMA R44, R71, R75.reuse, R44; /* 0x5980160004b7472c */ + /* 0x181fc480fe2607f1 */ + /*0408*/ FFMA R46, R69.reuse, R75.reuse, R46; /* 0x5980170004b7452e */ + /*0410*/ FFMA R47, R69, R74.reuse, R47; /* 0x5980178004a7452f */ + /*0418*/ FFMA R41, R70.reuse, R74.reuse, R41; /* 0x5980148004a74629 */ + /* 0x101fc4c0fe2407f1 */ + /*0428*/ FFMA R40, R70, R75.reuse, R40; /* 0x5980140004b74628 */ + /*0430*/ FFMA R42, R68.reuse, R75.reuse, R42; /* 0x5980150004b7442a */ + /*0438*/ FFMA R43, R68, R74.reuse, R43; /* 0x5980158004a7442b */ + /* 0x181fc480fe2607f1 */ + /*0448*/ FFMA R13, R67.reuse, R74.reuse, R13; /* 0x5980068004a7430d */ + /*0450*/ FFMA R12, R67, R75.reuse, R12; /* 0x5980060004b7430c */ + /*0458*/ FFMA R14, 
R65.reuse, R75.reuse, R14; /* 0x5980070004b7410e */ + /* 0x181fc4c0fe2407f1 */ + /*0468*/ FFMA R15, R65, R74.reuse, R15; /* 0x5980078004a7410f */ + /*0470*/ FFMA R9, R66.reuse, R74.reuse, R9; /* 0x5980048004a74209 */ + /*0478*/ FFMA R8, R66.reuse, R75.reuse, R8; /* 0x5980040004b74208 */ + /* 0x101fc440fe0207f1 */ + /*0488*/ FFMA R10, R64.reuse, R75, R10; /* 0x5980050004b7400a */ + /*0490*/ { FFMA R11, R64.reuse, R74, R11; /* 0x5980058004a7400b */ + /*0498*/ @P0 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf; } /* 0xdd38000787107060 */ + /* 0x101cc480fe0607e1 */ + /*04a8*/ FFMA R17, R66.reuse, R76.reuse, R17; /* 0x5980088004c74211 */ + /*04b0*/ { FFMA R16, R66, R77.reuse, R16; /* 0x5980080004d74210 */ + /*04b8*/ @P0 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf; } /* 0xdd38000787107464 */ + /* 0x181fc480fe2607f1 */ + /*04c8*/ FFMA R18, R64.reuse, R77.reuse, R18; /* 0x5980090004d74012 */ + /*04d0*/ FFMA R19, R64, R76.reuse, R19; /* 0x5980098004c74013 */ + /*04d8*/ FFMA R21, R67.reuse, R76.reuse, R21; /* 0x59800a8004c74315 */ + /* 0x101fc4c0fe2407f1 */ + /*04e8*/ FFMA R20, R67, R77.reuse, R20; /* 0x59800a0004d74314 */ + /*04f0*/ FFMA R22, R65.reuse, R77.reuse, R22; /* 0x59800b0004d74116 */ + /*04f8*/ FFMA R23, R65, R76.reuse, R23; /* 0x59800b8004c74117 */ + /* 0x181fc480fe2607f1 */ + /*0508*/ FFMA R49, R70.reuse, R76.reuse, R49; /* 0x5980188004c74631 */ + /*0510*/ FFMA R48, R70, R77.reuse, R48; /* 0x5980180004d74630 */ + /*0518*/ FFMA R50, R68.reuse, R77.reuse, R50; /* 0x5980190004d74432 */ + /* 0x181fc4c0fe2407f1 */ + /*0528*/ FFMA R51, R68, R76.reuse, R51; /* 0x5980198004c74433 */ + /*0530*/ FFMA R53, R71.reuse, R76.reuse, R53; /* 0x59801a8004c74735 */ + /*0538*/ FFMA R52, R71.reuse, R77.reuse, R52; /* 0x59801a0004d74734 */ + /* 0x181fc440fe2207f1 */ + /*0548*/ FFMA R54, R69.reuse, R77, R54; /* 0x59801b0004d74536 */ + /*0550*/ FFMA R55, R69.reuse, R76, R55; /* 0x59801b8004c74537 */ + /*0558*/ FFMA R61, R71.reuse, R78.reuse, R61; /* 0x59801e8004e7473d */ + /* 
0x101fc4c0fe2407f1 */ + /*0568*/ FFMA R60, R71, R79.reuse, R60; /* 0x59801e0004f7473c */ + /*0570*/ FFMA R62, R69.reuse, R79.reuse, R62; /* 0x59801f0004f7453e */ + /*0578*/ FFMA R63, R69, R78.reuse, R63; /* 0x59801f8004e7453f */ + /* 0x181fc480fe2607f1 */ + /*0588*/ FFMA R57, R70.reuse, R78.reuse, R57; /* 0x59801c8004e74639 */ + /*0590*/ FFMA R56, R70, R79.reuse, R56; /* 0x59801c0004f74638 */ + /*0598*/ FFMA R58, R68.reuse, R79.reuse, R58; /* 0x59801d0004f7443a */ + /* 0x101fc4c0fe2407f1 */ + /*05a8*/ FFMA R59, R68, R78.reuse, R59; /* 0x59801d8004e7443b */ + /*05b0*/ FFMA R29, R67.reuse, R78.reuse, R29; /* 0x59800e8004e7431d */ + /*05b8*/ FFMA R28, R67, R79.reuse, R28; /* 0x59800e0004f7431c */ + /* 0x181fc480fe2607f1 */ + /*05c8*/ FFMA R30, R65.reuse, R79.reuse, R30; /* 0x59800f0004f7411e */ + /*05d0*/ FFMA R31, R65, R78.reuse, R31; /* 0x59800f8004e7411f */ + /*05d8*/ FFMA R25, R66.reuse, R78.reuse, R25; /* 0x59800c8004e74219 */ + /* 0x001fc440fe2407f1 */ + /*05e8*/ FFMA R24, R66, R79.reuse, R24; /* 0x59800c0004f74218 */ + /*05f0*/ FFMA R26, R64.reuse, R79, R26; /* 0x59800d0004f7401a */ + /*05f8*/ FFMA R27, R64, R78, R27; /* 0x59800d8004e7401b */ + /* 0x101fc400fe260ff0 */ + /*0608*/ { FFMA R1, R82.reuse, R88.reuse, R1; /* 0x5980008005875201 */ + /*0610*/ LDS.U.128 R64, [R114+0x200]; } /* 0xef4e100020077240 */ + /*0618*/ FFMA R0, R82, R89.reuse, R0; /* 0x5980000005975200 */ + /* 0x101fc400fe2607f0 */ + /*0628*/ { FFMA R2, R80.reuse, R89.reuse, R2; /* 0x5980010005975002 */ + /*0630*/ LDS.U.128 R72, [R115+0x200]; } /* 0xef4e100020077348 */ + /*0638*/ FFMA R3, R80, R88.reuse, R3; /* 0x5980018005875003 */ + /* 0x101fc400fe2607f0 */ + /*0648*/ { FFMA R5, R83.reuse, R88.reuse, R5; /* 0x5980028005875305 */ + /*0650*/ LDS.U.128 R68, [R114+0x280]; } /* 0xef4e100028077244 */ + /*0658*/ FFMA R4, R83, R89.reuse, R4; /* 0x5980020005975304 */ + /* 0x101fc400e22607f0 */ + /*0668*/ { FFMA R6, R81.reuse, R89.reuse, R6; /* 0x5980030005975106 */ + /*0670*/ LDS.U.128 R76, 
[R115+0x280]; } /* 0xef4e10002807734c */ + /*0678*/ FFMA R7, R81, R88.reuse, R7; /* 0x5980038005875107 */ + /* 0x181fc480fe2607f1 */ + /*0688*/ FFMA R33, R86.reuse, R88.reuse, R33; /* 0x5980108005875621 */ + /*0690*/ FFMA R32, R86, R89.reuse, R32; /* 0x5980100005975620 */ + /*0698*/ FFMA R34, R84.reuse, R89.reuse, R34; /* 0x5980110005975422 */ + /* 0x181fc4c0fe2407f1 */ + /*06a8*/ FFMA R35, R84, R88.reuse, R35; /* 0x5980118005875423 */ + /*06b0*/ FFMA R37, R87.reuse, R88.reuse, R37; /* 0x5980128005875725 */ + /*06b8*/ FFMA R36, R87.reuse, R89.reuse, R36; /* 0x5980120005975724 */ + /* 0x181fc440fe2207f1 */ + /*06c8*/ FFMA R38, R85.reuse, R89, R38; /* 0x5980130005975526 */ + /*06d0*/ FFMA R39, R85.reuse, R88, R39; /* 0x5980138005875527 */ + /*06d8*/ FFMA R45, R87.reuse, R90.reuse, R45; /* 0x5980168005a7572d */ + /* 0x101fc4c0fe2407f1 */ + /*06e8*/ FFMA R44, R87, R91.reuse, R44; /* 0x5980160005b7572c */ + /*06f0*/ FFMA R46, R85.reuse, R91.reuse, R46; /* 0x5980170005b7552e */ + /*06f8*/ FFMA R47, R85, R90.reuse, R47; /* 0x5980178005a7552f */ + /* 0x181fc480fe2607f1 */ + /*0708*/ FFMA R41, R86.reuse, R90.reuse, R41; /* 0x5980148005a75629 */ + /*0710*/ FFMA R40, R86, R91.reuse, R40; /* 0x5980140005b75628 */ + /*0718*/ FFMA R42, R84.reuse, R91.reuse, R42; /* 0x5980150005b7542a */ + /* 0x101fc4c0fe2407f1 */ + /*0728*/ FFMA R43, R84, R90.reuse, R43; /* 0x5980158005a7542b */ + /*0730*/ FFMA R13, R83.reuse, R90.reuse, R13; /* 0x5980068005a7530d */ + /*0738*/ FFMA R12, R83, R91.reuse, R12; /* 0x5980060005b7530c */ + /* 0x181fc480fe2607f1 */ + /*0748*/ FFMA R14, R81.reuse, R91.reuse, R14; /* 0x5980070005b7510e */ + /*0750*/ FFMA R15, R81, R90.reuse, R15; /* 0x5980078005a7510f */ + /*0758*/ FFMA R9, R82.reuse, R90.reuse, R9; /* 0x5980048005a75209 */ + /* 0x081fc040fe2607f1 */ + /*0768*/ FFMA R8, R82.reuse, R91.reuse, R8; /* 0x5980040005b75208 */ + /*0770*/ FFMA R10, R80.reuse, R91, R10; /* 0x5980050005b7500a */ + /*0778*/ { FFMA R11, R80.reuse, R90, R11; /* 0x5980058005a7500b */ 
+ /*0788*/ @P0 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf; } /* 0x101fc0c0fc2407f1 */ + /* 0xdd38000787107868 */ + /*0790*/ FFMA R17, R82.reuse, R92.reuse, R17; /* 0x5980088005c75211 */ + /*0798*/ { FFMA R16, R82, R93.reuse, R16; /* 0x5980080005d75210 */ + /*07a8*/ @P0 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf; } /* 0x101fc4c0fe240751 */ + /* 0xdd38000787107c6c */ + /*07b0*/ FFMA R18, R80.reuse, R93.reuse, R18; /* 0x5980090005d75012 */ + /*07b8*/ FFMA R19, R80, R92.reuse, R19; /* 0x5980098005c75013 */ + /* 0x181fc480fe2607f1 */ + /*07c8*/ FFMA R21, R83.reuse, R92.reuse, R21; /* 0x59800a8005c75315 */ + /*07d0*/ FFMA R20, R83, R93.reuse, R20; /* 0x59800a0005d75314 */ + /*07d8*/ FFMA R22, R81.reuse, R93.reuse, R22; /* 0x59800b0005d75116 */ + /* 0x101fc4c0fe2407f1 */ + /*07e8*/ FFMA R23, R81, R92.reuse, R23; /* 0x59800b8005c75117 */ + /*07f0*/ FFMA R49, R86.reuse, R92.reuse, R49; /* 0x5980188005c75631 */ + /*07f8*/ FFMA R48, R86, R93.reuse, R48; /* 0x5980180005d75630 */ + /* 0x181fc480fe2607f1 */ + /*0808*/ FFMA R50, R84.reuse, R93.reuse, R50; /* 0x5980190005d75432 */ + /*0810*/ FFMA R51, R84, R92.reuse, R51; /* 0x5980198005c75433 */ + /*0818*/ FFMA R53, R87.reuse, R92.reuse, R53; /* 0x59801a8005c75735 */ + /* 0x081fc440fe2607f1 */ + /*0828*/ FFMA R52, R87.reuse, R93.reuse, R52; /* 0x59801a0005d75734 */ + /*0830*/ FFMA R54, R85.reuse, R93, R54; /* 0x59801b0005d75536 */ + /*0838*/ FFMA R55, R85.reuse, R92, R55; /* 0x59801b8005c75537 */ + /* 0x181fc480fe2607f1 */ + /*0848*/ FFMA R61, R87.reuse, R94.reuse, R61; /* 0x59801e8005e7573d */ + /*0850*/ FFMA R60, R87, R95.reuse, R60; /* 0x59801e0005f7573c */ + /*0858*/ FFMA R62, R85.reuse, R95.reuse, R62; /* 0x59801f0005f7553e */ + /* 0x101fc4c0fe2407f1 */ + /*0868*/ FFMA R63, R85, R94.reuse, R63; /* 0x59801f8005e7553f */ + /*0870*/ FFMA R57, R86.reuse, R94.reuse, R57; /* 0x59801c8005e75639 */ + /*0878*/ FFMA R56, R86, R95.reuse, R56; /* 0x59801c0005f75638 */ + /* 0x181fc480fe2607f1 */ + /*0888*/ FFMA R58, R84.reuse, R95.reuse, 
R58; /* 0x59801d0005f7543a */ + /*0890*/ FFMA R59, R84, R94.reuse, R59; /* 0x59801d8005e7543b */ + /*0898*/ FFMA R29, R83.reuse, R94.reuse, R29; /* 0x59800e8005e7531d */ + /* 0x101fc4c0fe2407f1 */ + /*08a8*/ FFMA R28, R83, R95.reuse, R28; /* 0x59800e0005f7531c */ + /*08b0*/ FFMA R30, R81.reuse, R95.reuse, R30; /* 0x59800f0005f7511e */ + /*08b8*/ FFMA R31, R81, R94.reuse, R31; /* 0x59800f8005e7511f */ + /* 0x081fc480fe2607f1 */ + /*08c8*/ FFMA R25, R82.reuse, R94.reuse, R25; /* 0x59800c8005e75219 */ + /*08d0*/ FFMA R24, R82, R95.reuse, R24; /* 0x59800c0005f75218 */ + /*08d8*/ FFMA R26, R80.reuse, R95, R26; /* 0x59800d0005f7501a */ + /* 0x001fc4c1fe0007f1 */ + /*08e8*/ FFMA R27, R80, R94, R27; /* 0x59800d8005e7501b */ + /*08f0*/ { FFMA R1, R66.reuse, R72.reuse, R1; /* 0x5980008004874201 */ + /*08f8*/ LDS.U.128 R80, [R114+0x300]; } /* 0xef4e100030077250 */ + /* 0x001fc4c0fe0407f1 */ + /*0908*/ FFMA R0, R66, R73.reuse, R0; /* 0x5980000004974200 */ + /*0910*/ { FFMA R2, R64.reuse, R73.reuse, R2; /* 0x5980010004974002 */ + /*0918*/ LDS.U.128 R88, [R115+0x300]; } /* 0xef4e100030077358 */ + /* 0x001fc4c0fe0407f1 */ + /*0928*/ FFMA R3, R64, R72.reuse, R3; /* 0x5980018004874003 */ + /*0930*/ { FFMA R5, R67.reuse, R72.reuse, R5; /* 0x5980028004874305 */ + /*0938*/ LDS.U.128 R84, [R114+0x380]; } /* 0xef4e100038077254 */ + /* 0x001c44c0fe0407f1 */ + /*0948*/ FFMA R4, R67, R73.reuse, R4; /* 0x5980020004974304 */ + /*0950*/ { FFMA R6, R65.reuse, R73.reuse, R6; /* 0x5980030004974106 */ + /*0958*/ LDS.U.128 R92, [R115+0x380]; } /* 0xef4e10003807735c */ + /* 0x101fc4c0fe2407f1 */ + /*0968*/ FFMA R7, R65, R72.reuse, R7; /* 0x5980038004874107 */ + /*0970*/ FFMA R33, R70.reuse, R72.reuse, R33; /* 0x5980108004874621 */ + /*0978*/ FFMA R32, R70, R73.reuse, R32; /* 0x5980100004974620 */ + /* 0x181fc480fe2607f1 */ + /*0988*/ FFMA R34, R68.reuse, R73.reuse, R34; /* 0x5980110004974422 */ + /*0990*/ FFMA R35, R68, R72.reuse, R35; /* 0x5980118004874423 */ + /*0998*/ FFMA R37, R71.reuse, 
R72.reuse, R37; /* 0x5980128004874725 */ + /* 0x081fc440fe2607f1 */ + /*09a8*/ FFMA R36, R71.reuse, R73.reuse, R36; /* 0x5980120004974724 */ + /*09b0*/ FFMA R38, R69.reuse, R73, R38; /* 0x5980130004974526 */ + /*09b8*/ FFMA R39, R69.reuse, R72, R39; /* 0x5980138004874527 */ + /* 0x181fc480fe2607f1 */ + /*09c8*/ FFMA R45, R71.reuse, R74.reuse, R45; /* 0x5980168004a7472d */ + /*09d0*/ FFMA R44, R71, R75.reuse, R44; /* 0x5980160004b7472c */ + /*09d8*/ FFMA R46, R69.reuse, R75.reuse, R46; /* 0x5980170004b7452e */ + /* 0x101fc4c0fe2407f1 */ + /*09e8*/ FFMA R47, R69, R74.reuse, R47; /* 0x5980178004a7452f */ + /*09f0*/ FFMA R41, R70.reuse, R74.reuse, R41; /* 0x5980148004a74629 */ + /*09f8*/ FFMA R40, R70, R75.reuse, R40; /* 0x5980140004b74628 */ + /* 0x181fc480fe2607f1 */ + /*0a08*/ FFMA R42, R68.reuse, R75.reuse, R42; /* 0x5980150004b7442a */ + /*0a10*/ FFMA R43, R68, R74.reuse, R43; /* 0x5980158004a7442b */ + /*0a18*/ FFMA R13, R67.reuse, R74.reuse, R13; /* 0x5980068004a7430d */ + /* 0x101fc4c0fe2407f1 */ + /*0a28*/ FFMA R12, R67, R75.reuse, R12; /* 0x5980060004b7430c */ + /*0a30*/ FFMA R14, R65.reuse, R75.reuse, R14; /* 0x5980070004b7410e */ + /*0a38*/ FFMA R15, R65, R74.reuse, R15; /* 0x5980078004a7410f */ + /* 0x081fc4c0fe2607f1 */ + /*0a48*/ FFMA R9, R66.reuse, R74.reuse, R9; /* 0x5980048004a74209 */ + /*0a50*/ FFMA R8, R66.reuse, R75.reuse, R8; /* 0x5980040004b74208 */ + /*0a58*/ FFMA R10, R64.reuse, R75, R10; /* 0x5980050004b7400a */ + /* 0x101fc4c0fc2207f1 */ + /*0a68*/ FFMA R11, R64.reuse, R74, R11; /* 0x5980058004a7400b */ + /*0a70*/ FFMA R17, R66.reuse, R76.reuse, R17; /* 0x5980088004c74211 */ + /*0a78*/ FFMA R16, R66, R77.reuse, R16; /* 0x5980080004d74210 */ + /* 0x181fc480fe2607f1 */ + /*0a88*/ FFMA R18, R64.reuse, R77.reuse, R18; /* 0x5980090004d74012 */ + /*0a90*/ FFMA R19, R64, R76.reuse, R19; /* 0x5980098004c74013 */ + /*0a98*/ FFMA R21, R67.reuse, R76.reuse, R21; /* 0x59800a8004c74315 */ + /* 0x101fc4c0fe2407f1 */ + /*0aa8*/ FFMA R20, R67, R77.reuse, 
R20; /* 0x59800a0004d74314 */ + /*0ab0*/ FFMA R22, R65.reuse, R77.reuse, R22; /* 0x59800b0004d74116 */ + /*0ab8*/ FFMA R23, R65, R76.reuse, R23; /* 0x59800b8004c74117 */ + /* 0x181fc480fe2607f1 */ + /*0ac8*/ FFMA R49, R70.reuse, R76.reuse, R49; /* 0x5980188004c74631 */ + /*0ad0*/ FFMA R48, R70, R77.reuse, R48; /* 0x5980180004d74630 */ + /*0ad8*/ FFMA R50, R68.reuse, R77.reuse, R50; /* 0x5980190004d74432 */ + /* 0x181fc4c0fe2407f1 */ + /*0ae8*/ FFMA R51, R68, R76.reuse, R51; /* 0x5980198004c74433 */ + /*0af0*/ FFMA R53, R71.reuse, R76.reuse, R53; /* 0x59801a8004c74735 */ + /*0af8*/ FFMA R52, R71.reuse, R77.reuse, R52; /* 0x59801a0004d74734 */ + /* 0x181fc440fe2207f1 */ + /*0b08*/ FFMA R54, R69.reuse, R77, R54; /* 0x59801b0004d74536 */ + /*0b10*/ FFMA R55, R69.reuse, R76, R55; /* 0x59801b8004c74537 */ + /*0b18*/ FFMA R61, R71.reuse, R78.reuse, R61; /* 0x59801e8004e7473d */ + /* 0x101fc4c0fe2407f1 */ + /*0b28*/ FFMA R60, R71, R79.reuse, R60; /* 0x59801e0004f7473c */ + /*0b30*/ FFMA R62, R69.reuse, R79.reuse, R62; /* 0x59801f0004f7453e */ + /*0b38*/ FFMA R63, R69, R78.reuse, R63; /* 0x59801f8004e7453f */ + /* 0x181fc480fe2607f1 */ + /*0b48*/ FFMA R57, R70.reuse, R78.reuse, R57; /* 0x59801c8004e74639 */ + /*0b50*/ FFMA R56, R70, R79.reuse, R56; /* 0x59801c0004f74638 */ + /*0b58*/ FFMA R58, R68.reuse, R79.reuse, R58; /* 0x59801d0004f7443a */ + /* 0x101fc4c0fe2407f1 */ + /*0b68*/ FFMA R59, R68, R78.reuse, R59; /* 0x59801d8004e7443b */ + /*0b70*/ FFMA R29, R67.reuse, R78.reuse, R29; /* 0x59800e8004e7431d */ + /*0b78*/ FFMA R28, R67, R79.reuse, R28; /* 0x59800e0004f7431c */ + /* 0x181fc480fe2607f1 */ + /*0b88*/ FFMA R30, R65.reuse, R79.reuse, R30; /* 0x59800f0004f7411e */ + /*0b90*/ FFMA R31, R65, R78.reuse, R31; /* 0x59800f8004e7411f */ + /*0b98*/ FFMA R25, R66.reuse, R78.reuse, R25; /* 0x59800c8004e74219 */ + /* 0x001fc440fe2407f1 */ + /*0ba8*/ FFMA R24, R66, R79.reuse, R24; /* 0x59800c0004f74218 */ + /*0bb0*/ FFMA R26, R64.reuse, R79, R26; /* 0x59800d0004f7401a */ + 
/*0bb8*/ FFMA R27, R64, R78, R27; /* 0x59800d8004e7401b */ + /* 0x101fc400fe260ff0 */ + /*0bc8*/ { FFMA R1, R82.reuse, R88.reuse, R1; /* 0x5980008005875201 */ + /*0bd0*/ LDS.U.128 R64, [R114+0x400]; } /* 0xef4e100040077240 */ + /*0bd8*/ FFMA R0, R82, R89.reuse, R0; /* 0x5980000005975200 */ + /* 0x101fc400fe2607f0 */ + /*0be8*/ { FFMA R2, R80.reuse, R89.reuse, R2; /* 0x5980010005975002 */ + /*0bf0*/ LDS.U.128 R72, [R115+0x400]; } /* 0xef4e100040077348 */ + /*0bf8*/ FFMA R3, R80, R88.reuse, R3; /* 0x5980018005875003 */ + /* 0x101fc400fe2607f0 */ + /*0c08*/ { FFMA R5, R83.reuse, R88.reuse, R5; /* 0x5980028005875305 */ + /*0c10*/ LDS.U.128 R68, [R114+0x480]; } /* 0xef4e100048077244 */ + /*0c18*/ FFMA R4, R83, R89.reuse, R4; /* 0x5980020005975304 */ + /* 0x101fc400e22607f0 */ + /*0c28*/ { FFMA R6, R81.reuse, R89.reuse, R6; /* 0x5980030005975106 */ + /*0c30*/ LDS.U.128 R76, [R115+0x480]; } /* 0xef4e10004807734c */ + /*0c38*/ FFMA R7, R81, R88.reuse, R7; /* 0x5980038005875107 */ + /* 0x181fc480fe2607f1 */ + /*0c48*/ FFMA R33, R86.reuse, R88.reuse, R33; /* 0x5980108005875621 */ + /*0c50*/ FFMA R32, R86, R89.reuse, R32; /* 0x5980100005975620 */ + /*0c58*/ FFMA R34, R84.reuse, R89.reuse, R34; /* 0x5980110005975422 */ + /* 0x181fc4c0fe2407f1 */ + /*0c68*/ FFMA R35, R84, R88.reuse, R35; /* 0x5980118005875423 */ + /*0c70*/ FFMA R37, R87.reuse, R88.reuse, R37; /* 0x5980128005875725 */ + /*0c78*/ FFMA R36, R87.reuse, R89.reuse, R36; /* 0x5980120005975724 */ + /* 0x181fc440fe2207f1 */ + /*0c88*/ FFMA R38, R85.reuse, R89, R38; /* 0x5980130005975526 */ + /*0c90*/ FFMA R39, R85.reuse, R88, R39; /* 0x5980138005875527 */ + /*0c98*/ FFMA R45, R87.reuse, R90.reuse, R45; /* 0x5980168005a7572d */ + /* 0x101fc4c0fe2407f1 */ + /*0ca8*/ FFMA R44, R87, R91.reuse, R44; /* 0x5980160005b7572c */ + /*0cb0*/ FFMA R46, R85.reuse, R91.reuse, R46; /* 0x5980170005b7552e */ + /*0cb8*/ FFMA R47, R85, R90.reuse, R47; /* 0x5980178005a7552f */ + /* 0x181fc480fe2607f1 */ + /*0cc8*/ FFMA R41, R86.reuse, 
R90.reuse, R41; /* 0x5980148005a75629 */ + /*0cd0*/ FFMA R40, R86, R91.reuse, R40; /* 0x5980140005b75628 */ + /*0cd8*/ FFMA R42, R84.reuse, R91.reuse, R42; /* 0x5980150005b7542a */ + /* 0x101fc4c0fe2407f1 */ + /*0ce8*/ FFMA R43, R84, R90.reuse, R43; /* 0x5980158005a7542b */ + /*0cf0*/ FFMA R13, R83.reuse, R90.reuse, R13; /* 0x5980068005a7530d */ + /*0cf8*/ FFMA R12, R83, R91.reuse, R12; /* 0x5980060005b7530c */ + /* 0x181fc480fe2607f1 */ + /*0d08*/ FFMA R14, R81.reuse, R91.reuse, R14; /* 0x5980070005b7510e */ + /*0d10*/ FFMA R15, R81, R90.reuse, R15; /* 0x5980078005a7510f */ + /*0d18*/ FFMA R9, R82.reuse, R90.reuse, R9; /* 0x5980048005a75209 */ + /* 0x081fc440fe2607f1 */ + /*0d28*/ FFMA R8, R82.reuse, R91.reuse, R8; /* 0x5980040005b75208 */ + /*0d30*/ FFMA R10, R80.reuse, R91, R10; /* 0x5980050005b7500a */ + /*0d38*/ FFMA R11, R80.reuse, R90, R11; /* 0x5980058005a7500b */ + /* 0x181fc480fe2607e1 */ + /*0d48*/ FFMA R17, R82.reuse, R92.reuse, R17; /* 0x5980088005c75211 */ + /*0d50*/ FFMA R16, R82, R93.reuse, R16; /* 0x5980080005d75210 */ + /*0d58*/ FFMA R18, R80.reuse, R93.reuse, R18; /* 0x5980090005d75012 */ + /* 0x101fc4c0fe2407f1 */ + /*0d68*/ FFMA R19, R80, R92.reuse, R19; /* 0x5980098005c75013 */ + /*0d70*/ FFMA R21, R83.reuse, R92.reuse, R21; /* 0x59800a8005c75315 */ + /*0d78*/ FFMA R20, R83, R93.reuse, R20; /* 0x59800a0005d75314 */ + /* 0x181fc480fe2607f1 */ + /*0d88*/ FFMA R22, R81.reuse, R93.reuse, R22; /* 0x59800b0005d75116 */ + /*0d90*/ FFMA R23, R81, R92.reuse, R23; /* 0x59800b8005c75117 */ + /*0d98*/ FFMA R49, R86.reuse, R92.reuse, R49; /* 0x5980188005c75631 */ + /* 0x101fc4c0fe2407f1 */ + /*0da8*/ FFMA R48, R86, R93.reuse, R48; /* 0x5980180005d75630 */ + /*0db0*/ FFMA R50, R84.reuse, R93.reuse, R50; /* 0x5980190005d75432 */ + /*0db8*/ FFMA R51, R84, R92.reuse, R51; /* 0x5980198005c75433 */ + /* 0x081fc4c0fe2607f1 */ + /*0dc8*/ FFMA R53, R87.reuse, R92.reuse, R53; /* 0x59801a8005c75735 */ + /*0dd0*/ FFMA R52, R87.reuse, R93.reuse, R52; /* 
0x59801a0005d75734 */ + /*0dd8*/ FFMA R54, R85.reuse, R93, R54; /* 0x59801b0005d75536 */ + /* 0x101fc4c0fe2207f1 */ + /*0de8*/ FFMA R55, R85.reuse, R92, R55; /* 0x59801b8005c75537 */ + /*0df0*/ FFMA R61, R87.reuse, R94.reuse, R61; /* 0x59801e8005e7573d */ + /*0df8*/ FFMA R60, R87, R95.reuse, R60; /* 0x59801e0005f7573c */ + /* 0x181fc480fe2607f1 */ + /*0e08*/ FFMA R62, R85.reuse, R95.reuse, R62; /* 0x59801f0005f7553e */ + /*0e10*/ FFMA R63, R85, R94.reuse, R63; /* 0x59801f8005e7553f */ + /*0e18*/ FFMA R57, R86.reuse, R94.reuse, R57; /* 0x59801c8005e75639 */ + /* 0x101fc4c0fe2407f1 */ + /*0e28*/ FFMA R56, R86, R95.reuse, R56; /* 0x59801c0005f75638 */ + /*0e30*/ FFMA R58, R84.reuse, R95.reuse, R58; /* 0x59801d0005f7543a */ + /*0e38*/ FFMA R59, R84, R94.reuse, R59; /* 0x59801d8005e7543b */ + /* 0x181fc480fe2607f1 */ + /*0e48*/ FFMA R29, R83.reuse, R94.reuse, R29; /* 0x59800e8005e7531d */ + /*0e50*/ FFMA R28, R83, R95.reuse, R28; /* 0x59800e0005f7531c */ + /*0e58*/ FFMA R30, R81.reuse, R95.reuse, R30; /* 0x59800f0005f7511e */ + /* 0x101fc4c0fe2407f1 */ + /*0e68*/ FFMA R31, R81, R94.reuse, R31; /* 0x59800f8005e7511f */ + /*0e70*/ FFMA R25, R82.reuse, R94.reuse, R25; /* 0x59800c8005e75219 */ + /*0e78*/ FFMA R24, R82, R95.reuse, R24; /* 0x59800c0005f75218 */ + /* 0x183fc000fe2207f1 */ + /*0e88*/ FFMA R26, R80.reuse, R95, R26; /* 0x59800d0005f7501a */ + /*0e90*/ FFMA R27, R80, R94, R27; /* 0x59800d8005e7501b */ + /*0e98*/ { FFMA R1, R66.reuse, R72.reuse, R1; /* 0x5980008004874201 */ + /*0ea8*/ LDS.U.128 R80, [R114+0x500]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100050077250 */ + /*0eb0*/ FFMA R0, R66, R73.reuse, R0; /* 0x5980000004974200 */ + /*0eb8*/ { FFMA R2, R64.reuse, R73.reuse, R2; /* 0x5980010004974002 */ + /*0ec8*/ LDS.U.128 R88, [R115+0x500]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100050077358 */ + /*0ed0*/ FFMA R3, R64, R72.reuse, R3; /* 0x5980018004874003 */ + /*0ed8*/ { FFMA R5, R67.reuse, R72.reuse, R5; /* 0x5980028004874305 */ + /*0ee8*/ LDS.U.128 R84, 
[R114+0x580]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100058077254 */ + /*0ef0*/ FFMA R4, R67, R73.reuse, R4; /* 0x5980020004974304 */ + /*0ef8*/ { FFMA R6, R65.reuse, R73.reuse, R6; /* 0x5980030004974106 */ + /*0f08*/ LDS.U.128 R92, [R115+0x580]; } /* 0x181fc480fe200711 */ + /* 0xef4e10005807735c */ + /*0f10*/ FFMA R7, R65, R72.reuse, R7; /* 0x5980038004874107 */ + /*0f18*/ FFMA R33, R70.reuse, R72.reuse, R33; /* 0x5980108004874621 */ + /* 0x101fc4c0fe2407f1 */ + /*0f28*/ FFMA R32, R70, R73.reuse, R32; /* 0x5980100004974620 */ + /*0f30*/ FFMA R34, R68.reuse, R73.reuse, R34; /* 0x5980110004974422 */ + /*0f38*/ FFMA R35, R68, R72.reuse, R35; /* 0x5980118004874423 */ + /* 0x081fc4c0fe2607f1 */ + /*0f48*/ FFMA R37, R71.reuse, R72.reuse, R37; /* 0x5980128004874725 */ + /*0f50*/ FFMA R36, R71.reuse, R73.reuse, R36; /* 0x5980120004974724 */ + /*0f58*/ FFMA R38, R69.reuse, R73, R38; /* 0x5980130004974526 */ + /* 0x101fc4c0fe2207f1 */ + /*0f68*/ FFMA R39, R69.reuse, R72, R39; /* 0x5980138004874527 */ + /*0f70*/ FFMA R45, R71.reuse, R74.reuse, R45; /* 0x5980168004a7472d */ + /*0f78*/ FFMA R44, R71, R75.reuse, R44; /* 0x5980160004b7472c */ + /* 0x181fc480fe2607f1 */ + /*0f88*/ FFMA R46, R69.reuse, R75.reuse, R46; /* 0x5980170004b7452e */ + /*0f90*/ FFMA R47, R69, R74.reuse, R47; /* 0x5980178004a7452f */ + /*0f98*/ FFMA R41, R70.reuse, R74.reuse, R41; /* 0x5980148004a74629 */ + /* 0x101fc4c0fe2407f1 */ + /*0fa8*/ FFMA R40, R70, R75.reuse, R40; /* 0x5980140004b74628 */ + /*0fb0*/ FFMA R42, R68.reuse, R75.reuse, R42; /* 0x5980150004b7442a */ + /*0fb8*/ FFMA R43, R68, R74.reuse, R43; /* 0x5980158004a7442b */ + /* 0x181fc480fe2607f1 */ + /*0fc8*/ FFMA R13, R67.reuse, R74.reuse, R13; /* 0x5980068004a7430d */ + /*0fd0*/ FFMA R12, R67, R75.reuse, R12; /* 0x5980060004b7430c */ + /*0fd8*/ FFMA R14, R65.reuse, R75.reuse, R14; /* 0x5980070004b7410e */ + /* 0x181fc4c0fe2407f1 */ + /*0fe8*/ FFMA R15, R65, R74.reuse, R15; /* 0x5980078004a7410f */ + /*0ff0*/ FFMA R9, R66.reuse, R74.reuse, R9; 
/* 0x5980048004a74209 */ + /*0ff8*/ FFMA R8, R66.reuse, R75.reuse, R8; /* 0x5980040004b74208 */ + /* 0x181f8440fe2207f1 */ + /*1008*/ FFMA R10, R64.reuse, R75, R10; /* 0x5980050004b7400a */ + /*1010*/ FFMA R11, R64.reuse, R74, R11; /* 0x5980058004a7400b */ + /*1018*/ FFMA R17, R66.reuse, R76.reuse, R17; /* 0x5980088004c74211 */ + /* 0x101fc4c0fe2407f1 */ + /*1028*/ FFMA R16, R66, R77.reuse, R16; /* 0x5980080004d74210 */ + /*1030*/ FFMA R18, R64.reuse, R77.reuse, R18; /* 0x5980090004d74012 */ + /*1038*/ FFMA R19, R64, R76.reuse, R19; /* 0x5980098004c74013 */ + /* 0x181fc480fe2607f1 */ + /*1048*/ FFMA R21, R67.reuse, R76.reuse, R21; /* 0x59800a8004c74315 */ + /*1050*/ FFMA R20, R67, R77.reuse, R20; /* 0x59800a0004d74314 */ + /*1058*/ FFMA R22, R65.reuse, R77.reuse, R22; /* 0x59800b0004d74116 */ + /* 0x101fc4c0fe2407f1 */ + /*1068*/ FFMA R23, R65, R76.reuse, R23; /* 0x59800b8004c74117 */ + /*1070*/ FFMA R49, R70.reuse, R76.reuse, R49; /* 0x5980188004c74631 */ + /*1078*/ FFMA R48, R70, R77.reuse, R48; /* 0x5980180004d74630 */ + /* 0x181fc480fe2607f1 */ + /*1088*/ FFMA R50, R68.reuse, R77.reuse, R50; /* 0x5980190004d74432 */ + /*1090*/ FFMA R51, R68, R76.reuse, R51; /* 0x5980198004c74433 */ + /*1098*/ FFMA R53, R71.reuse, R76.reuse, R53; /* 0x59801a8004c74735 */ + /* 0x081fc440fe2607f1 */ + /*10a8*/ FFMA R52, R71.reuse, R77.reuse, R52; /* 0x59801a0004d74734 */ + /*10b0*/ FFMA R54, R69.reuse, R77, R54; /* 0x59801b0004d74536 */ + /*10b8*/ FFMA R55, R69.reuse, R76, R55; /* 0x59801b8004c74537 */ + /* 0x181fc480fe2607f1 */ + /*10c8*/ FFMA R61, R71.reuse, R78.reuse, R61; /* 0x59801e8004e7473d */ + /*10d0*/ FFMA R60, R71, R79.reuse, R60; /* 0x59801e0004f7473c */ + /*10d8*/ FFMA R62, R69.reuse, R79.reuse, R62; /* 0x59801f0004f7453e */ + /* 0x101fc4c0fe2407f1 */ + /*10e8*/ FFMA R63, R69, R78.reuse, R63; /* 0x59801f8004e7453f */ + /*10f0*/ FFMA R57, R70.reuse, R78.reuse, R57; /* 0x59801c8004e74639 */ + /*10f8*/ FFMA R56, R70, R79.reuse, R56; /* 0x59801c0004f74638 */ + /* 
0x181fc480fe2607f1 */ + /*1108*/ FFMA R58, R68.reuse, R79.reuse, R58; /* 0x59801d0004f7443a */ + /*1110*/ FFMA R59, R68, R78.reuse, R59; /* 0x59801d8004e7443b */ + /*1118*/ FFMA R29, R67.reuse, R78.reuse, R29; /* 0x59800e8004e7431d */ + /* 0x101fc4c0fe2407f1 */ + /*1128*/ FFMA R28, R67, R79.reuse, R28; /* 0x59800e0004f7431c */ + /*1130*/ FFMA R30, R65.reuse, R79.reuse, R30; /* 0x59800f0004f7411e */ + /*1138*/ FFMA R31, R65, R78.reuse, R31; /* 0x59800f8004e7411f */ + /* 0x081fc480fe2607f1 */ + /*1148*/ FFMA R25, R66.reuse, R78.reuse, R25; /* 0x59800c8004e74219 */ + /*1150*/ FFMA R24, R66, R79.reuse, R24; /* 0x59800c0004f74218 */ + /*1158*/ FFMA R26, R64.reuse, R79, R26; /* 0x59800d0004f7401a */ + /* 0x001fc4c1fe0007f1 */ + /*1168*/ FFMA R27, R64, R78, R27; /* 0x59800d8004e7401b */ + /*1170*/ { FFMA R1, R82.reuse, R88.reuse, R1; /* 0x5980008005875201 */ + /*1178*/ LDS.U.128 R64, [R114+0x600]; } /* 0xef4e100060077240 */ + /* 0x001fc4c0fe0407f1 */ + /*1188*/ FFMA R0, R82, R89.reuse, R0; /* 0x5980000005975200 */ + /*1190*/ { FFMA R2, R80.reuse, R89.reuse, R2; /* 0x5980010005975002 */ + /*1198*/ LDS.U.128 R72, [R115+0x600]; } /* 0xef4e100060077348 */ + /* 0x001fc4c0fe0407f1 */ + /*11a8*/ FFMA R3, R80, R88.reuse, R3; /* 0x5980018005875003 */ + /*11b0*/ { FFMA R5, R83.reuse, R88.reuse, R5; /* 0x5980028005875305 */ + /*11b8*/ LDS.U.128 R68, [R114+0x680]; } /* 0xef4e100068077244 */ + /* 0x001c44c0fe0407f1 */ + /*11c8*/ FFMA R4, R83, R89.reuse, R4; /* 0x5980020005975304 */ + /*11d0*/ { FFMA R6, R81.reuse, R89.reuse, R6; /* 0x5980030005975106 */ + /*11d8*/ LDS.U.128 R76, [R115+0x680]; } /* 0xef4e10006807734c */ + /* 0x101fc4c0fe2407f1 */ + /*11e8*/ FFMA R7, R81, R88.reuse, R7; /* 0x5980038005875107 */ + /*11f0*/ FFMA R33, R86.reuse, R88.reuse, R33; /* 0x5980108005875621 */ + /*11f8*/ FFMA R32, R86, R89.reuse, R32; /* 0x5980100005975620 */ + /* 0x181fc480fe2607f1 */ + /*1208*/ FFMA R34, R84.reuse, R89.reuse, R34; /* 0x5980110005975422 */ + /*1210*/ FFMA R35, R84, R88.reuse, 
R35; /* 0x5980118005875423 */ + /*1218*/ FFMA R37, R87.reuse, R88.reuse, R37; /* 0x5980128005875725 */ + /* 0x081fc440fe2607f1 */ + /*1228*/ FFMA R36, R87.reuse, R89.reuse, R36; /* 0x5980120005975724 */ + /*1230*/ FFMA R38, R85.reuse, R89, R38; /* 0x5980130005975526 */ + /*1238*/ FFMA R39, R85.reuse, R88, R39; /* 0x5980138005875527 */ + /* 0x181fc480fe2607f1 */ + /*1248*/ FFMA R45, R87.reuse, R90.reuse, R45; /* 0x5980168005a7572d */ + /*1250*/ FFMA R44, R87, R91.reuse, R44; /* 0x5980160005b7572c */ + /*1258*/ FFMA R46, R85.reuse, R91.reuse, R46; /* 0x5980170005b7552e */ + /* 0x101fc4c0fe2407f1 */ + /*1268*/ FFMA R47, R85, R90.reuse, R47; /* 0x5980178005a7552f */ + /*1270*/ FFMA R41, R86.reuse, R90.reuse, R41; /* 0x5980148005a75629 */ + /*1278*/ FFMA R40, R86, R91.reuse, R40; /* 0x5980140005b75628 */ + /* 0x181fc480fe2607f1 */ + /*1288*/ FFMA R42, R84.reuse, R91.reuse, R42; /* 0x5980150005b7542a */ + /*1290*/ FFMA R43, R84, R90.reuse, R43; /* 0x5980158005a7542b */ + /*1298*/ FFMA R13, R83.reuse, R90.reuse, R13; /* 0x5980068005a7530d */ + /* 0x101fc4c0fe2407f1 */ + /*12a8*/ FFMA R12, R83, R91.reuse, R12; /* 0x5980060005b7530c */ + /*12b0*/ FFMA R14, R81.reuse, R91.reuse, R14; /* 0x5980070005b7510e */ + /*12b8*/ FFMA R15, R81, R90.reuse, R15; /* 0x5980078005a7510f */ + /* 0x081fc0c0fe2607f1 */ + /*12c8*/ FFMA R9, R82.reuse, R90.reuse, R9; /* 0x5980048005a75209 */ + /*12d0*/ FFMA R8, R82.reuse, R91.reuse, R8; /* 0x5980040005b75208 */ + /*12d8*/ { FFMA R10, R80.reuse, R91, R10; /* 0x5980050005b7500a */ + /*12e8*/ @P0 STS.128 [R118], R96; } /* 0x181f8440fe2017f1 */ + /* 0xef5e000000007660 */ + /*12f0*/ FFMA R11, R80.reuse, R90, R11; /* 0x5980058005a7500b */ + /*12f8*/ FFMA R17, R82.reuse, R92.reuse, R17; /* 0x5980088005c75211 */ + /* 0x001fc4c0fe0407f1 */ + /*1308*/ FFMA R16, R82, R93.reuse, R16; /* 0x5980080005d75210 */ + /*1310*/ { FFMA R18, R80.reuse, R93.reuse, R18; /* 0x5980090005d75012 */ + /*1318*/ @P0 STS.128 [R118+0x200], R100; } /* 0xef5e000020007664 */ + /* 
0x101fc4c0fe2407f1 */ + /*1328*/ FFMA R19, R80, R92.reuse, R19; /* 0x5980098005c75013 */ + /*1330*/ FFMA R21, R83.reuse, R92.reuse, R21; /* 0x59800a8005c75315 */ + /*1338*/ FFMA R20, R83, R93.reuse, R20; /* 0x59800a0005d75314 */ + /* 0x181fc480fe2607f1 */ + /*1348*/ FFMA R22, R81.reuse, R93.reuse, R22; /* 0x59800b0005d75116 */ + /*1350*/ FFMA R23, R81, R92.reuse, R23; /* 0x59800b8005c75117 */ + /*1358*/ FFMA R49, R86.reuse, R92.reuse, R49; /* 0x5980188005c75631 */ + /* 0x101fc4c0fe2407f1 */ + /*1368*/ FFMA R48, R86, R93.reuse, R48; /* 0x5980180005d75630 */ + /*1370*/ FFMA R50, R84.reuse, R93.reuse, R50; /* 0x5980190005d75432 */ + /*1378*/ FFMA R51, R84, R92.reuse, R51; /* 0x5980198005c75433 */ + /* 0x081fc4c0fe2607f1 */ + /*1388*/ FFMA R53, R87.reuse, R92.reuse, R53; /* 0x59801a8005c75735 */ + /*1390*/ FFMA R52, R87.reuse, R93.reuse, R52; /* 0x59801a0005d75734 */ + /*1398*/ FFMA R54, R85.reuse, R93, R54; /* 0x59801b0005d75536 */ + /* 0x101fc4c0fe2207f1 */ + /*13a8*/ FFMA R55, R85.reuse, R92, R55; /* 0x59801b8005c75537 */ + /*13b0*/ FFMA R61, R87.reuse, R94.reuse, R61; /* 0x59801e8005e7573d */ + /*13b8*/ FFMA R60, R87, R95.reuse, R60; /* 0x59801e0005f7573c */ + /* 0x181fc480fe2607f1 */ + /*13c8*/ FFMA R62, R85.reuse, R95.reuse, R62; /* 0x59801f0005f7553e */ + /*13d0*/ FFMA R63, R85, R94.reuse, R63; /* 0x59801f8005e7553f */ + /*13d8*/ FFMA R57, R86.reuse, R94.reuse, R57; /* 0x59801c8005e75639 */ + /* 0x101fc4c0fe2407f1 */ + /*13e8*/ FFMA R56, R86, R95.reuse, R56; /* 0x59801c0005f75638 */ + /*13f0*/ FFMA R58, R84.reuse, R95.reuse, R58; /* 0x59801d0005f7543a */ + /*13f8*/ FFMA R59, R84, R94.reuse, R59; /* 0x59801d8005e7543b */ + /* 0x181fc480fe2607f1 */ + /*1408*/ FFMA R29, R83.reuse, R94.reuse, R29; /* 0x59800e8005e7531d */ + /*1410*/ FFMA R28, R83, R95.reuse, R28; /* 0x59800e0005f7531c */ + /*1418*/ FFMA R30, R81.reuse, R95.reuse, R30; /* 0x59800f0005f7511e */ + /* 0x101fc4c0fe2407f1 */ + /*1428*/ FFMA R31, R81, R94.reuse, R31; /* 0x59800f8005e7511f */ + /*1430*/ 
FFMA R25, R82.reuse, R94.reuse, R25; /* 0x59800c8005e75219 */ + /*1438*/ FFMA R24, R82, R95.reuse, R24; /* 0x59800c0005f75218 */ + /* 0x183fc000fe2207f1 */ + /*1448*/ FFMA R26, R80.reuse, R95, R26; /* 0x59800d0005f7501a */ + /*1450*/ FFMA R27, R80, R94, R27; /* 0x59800d8005e7501b */ + /*1458*/ { FFMA R1, R66.reuse, R72.reuse, R1; /* 0x5980008004874201 */ + /*1468*/ LDS.U.128 R80, [R114+0x700]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100070077250 */ + /*1470*/ FFMA R0, R66, R73.reuse, R0; /* 0x5980000004974200 */ + /*1478*/ { FFMA R2, R64.reuse, R73.reuse, R2; /* 0x5980010004974002 */ + /*1488*/ LDS.U.128 R88, [R115+0x700]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100070077358 */ + /*1490*/ FFMA R3, R64, R72.reuse, R3; /* 0x5980018004874003 */ + /*1498*/ { FFMA R5, R67.reuse, R72.reuse, R5; /* 0x5980028004874305 */ + /*14a8*/ LDS.U.128 R84, [R114+0x780]; } /* 0x181fc080fe2007f1 */ + /* 0xef4e100078077254 */ + /*14b0*/ FFMA R4, R67, R73.reuse, R4; /* 0x5980020004974304 */ + /*14b8*/ { FFMA R6, R65.reuse, R73.reuse, R6; /* 0x5980030004974106 */ + /*14c8*/ LDS.U.128 R92, [R115+0x780]; } /* 0x181fc480fe200711 */ + /* 0xef4e10007807735c */ + /*14d0*/ FFMA R7, R65, R72.reuse, R7; /* 0x5980038004874107 */ + /*14d8*/ FFMA R33, R70.reuse, R72.reuse, R33; /* 0x5980108004874621 */ + /* 0x101fc4c0fe2407f1 */ + /*14e8*/ FFMA R32, R70, R73.reuse, R32; /* 0x5980100004974620 */ + /*14f0*/ FFMA R34, R68.reuse, R73.reuse, R34; /* 0x5980110004974422 */ + /*14f8*/ FFMA R35, R68, R72.reuse, R35; /* 0x5980118004874423 */ + /* 0x081fc4c0fe2607f1 */ + /*1508*/ FFMA R37, R71.reuse, R72.reuse, R37; /* 0x5980128004874725 */ + /*1510*/ FFMA R36, R71.reuse, R73.reuse, R36; /* 0x5980120004974724 */ + /*1518*/ FFMA R38, R69.reuse, R73, R38; /* 0x5980130004974526 */ + /* 0x101fc4c0fe2207f1 */ + /*1528*/ FFMA R39, R69.reuse, R72, R39; /* 0x5980138004874527 */ + /*1530*/ FFMA R45, R71.reuse, R74.reuse, R45; /* 0x5980168004a7472d */ + /*1538*/ FFMA R44, R71, R75.reuse, R44; /* 0x5980160004b7472c */ + /* 
0x181fc480fe2607f1 */ + /*1548*/ FFMA R46, R69.reuse, R75.reuse, R46; /* 0x5980170004b7452e */ + /*1550*/ FFMA R47, R69, R74.reuse, R47; /* 0x5980178004a7452f */ + /*1558*/ FFMA R41, R70.reuse, R74.reuse, R41; /* 0x5980148004a74629 */ + /* 0x101fc4c0fe2407f1 */ + /*1568*/ FFMA R40, R70, R75.reuse, R40; /* 0x5980140004b74628 */ + /*1570*/ FFMA R42, R68.reuse, R75.reuse, R42; /* 0x5980150004b7442a */ + /*1578*/ FFMA R43, R68, R74.reuse, R43; /* 0x5980158004a7442b */ + /* 0x181fc480fe2607f1 */ + /*1588*/ FFMA R13, R67.reuse, R74.reuse, R13; /* 0x5980068004a7430d */ + /*1590*/ FFMA R12, R67, R75.reuse, R12; /* 0x5980060004b7430c */ + /*1598*/ FFMA R14, R65.reuse, R75.reuse, R14; /* 0x5980070004b7410e */ + /* 0x181fc4c0fe2407f1 */ + /*15a8*/ FFMA R15, R65, R74.reuse, R15; /* 0x5980078004a7410f */ + /*15b0*/ FFMA R9, R66.reuse, R74.reuse, R9; /* 0x5980048004a74209 */ + /*15b8*/ FFMA R8, R66.reuse, R75.reuse, R8; /* 0x5980040004b74208 */ + /* 0x081fc404fe2207f0 */ + /*15c8*/ { FFMA R10, R64.reuse, R75, R10; /* 0x5980050004b7400a */ + /*15d0*/ @P0 STS.128 [R118+0x400], R104; } /* 0xef5e000040007668 */ + /*15d8*/ FFMA R11, R64.reuse, R74, R11; /* 0x5980058004a7400b */ + /* 0x181fc080fe2607e1 */ + /*15e8*/ FFMA R17, R66.reuse, R76.reuse, R17; /* 0x5980088004c74211 */ + /*15f0*/ FFMA R16, R66, R77.reuse, R16; /* 0x5980080004d74210 */ + /*15f8*/ { FFMA R18, R64.reuse, R77.reuse, R18; /* 0x5980090004d74012 */ + /*1608*/ @P0 STS.128 [R118+0x600], R108; } /* 0x181fc480fe2007f1 */ + /* 0xef5e00006000766c */ + /*1610*/ FFMA R19, R64, R76.reuse, R19; /* 0x5980098004c74013 */ + /*1618*/ FFMA R21, R67.reuse, R76.reuse, R21; /* 0x59800a8004c74315 */ + /* 0x101fc4c0fe2407f1 */ + /*1628*/ FFMA R20, R67, R77.reuse, R20; /* 0x59800a0004d74314 */ + /*1630*/ FFMA R22, R65.reuse, R77.reuse, R22; /* 0x59800b0004d74116 */ + /*1638*/ FFMA R23, R65, R76.reuse, R23; /* 0x59800b8004c74117 */ + /* 0x181fc480fe2607f1 */ + /*1648*/ FFMA R49, R70.reuse, R76.reuse, R49; /* 0x5980188004c74631 */ + 
/*1650*/ FFMA R48, R70, R77.reuse, R48; /* 0x5980180004d74630 */ + /*1658*/ FFMA R50, R68.reuse, R77.reuse, R50; /* 0x5980190004d74432 */ + /* 0x181fc4c0fe2407f1 */ + /*1668*/ FFMA R51, R68, R76.reuse, R51; /* 0x5980198004c74433 */ + /*1670*/ FFMA R53, R71.reuse, R76.reuse, R53; /* 0x59801a8004c74735 */ + /*1678*/ FFMA R52, R71.reuse, R77.reuse, R52; /* 0x59801a0004d74734 */ + /* 0x181fc440fe2207f1 */ + /*1688*/ FFMA R54, R69.reuse, R77, R54; /* 0x59801b0004d74536 */ + /*1690*/ FFMA R55, R69.reuse, R76, R55; /* 0x59801b8004c74537 */ + /*1698*/ FFMA R61, R71.reuse, R78.reuse, R61; /* 0x59801e8004e7473d */ + /* 0x101fc4c0fe2407f1 */ + /*16a8*/ FFMA R60, R71, R79.reuse, R60; /* 0x59801e0004f7473c */ + /*16b0*/ FFMA R62, R69.reuse, R79.reuse, R62; /* 0x59801f0004f7453e */ + /*16b8*/ FFMA R63, R69, R78.reuse, R63; /* 0x59801f8004e7453f */ + /* 0x181fc480fe2607f1 */ + /*16c8*/ FFMA R57, R70.reuse, R78.reuse, R57; /* 0x59801c8004e74639 */ + /*16d0*/ FFMA R56, R70, R79.reuse, R56; /* 0x59801c0004f74638 */ + /*16d8*/ FFMA R58, R68.reuse, R79.reuse, R58; /* 0x59801d0004f7443a */ + /* 0x101fc4c0fe2407f1 */ + /*16e8*/ FFMA R59, R68, R78.reuse, R59; /* 0x59801d8004e7443b */ + /*16f0*/ FFMA R29, R67.reuse, R78.reuse, R29; /* 0x59800e8004e7431d */ + /*16f8*/ FFMA R28, R67, R79.reuse, R28; /* 0x59800e0004f7431c */ + /* 0x181fc480fe2607f1 */ + /*1708*/ FFMA R30, R65.reuse, R79.reuse, R30; /* 0x59800f0004f7411e */ + /*1710*/ FFMA R31, R65, R78.reuse, R31; /* 0x59800f8004e7411f */ + /*1718*/ FFMA R25, R66.reuse, R78.reuse, R25; /* 0x59800c8004e74219 */ + /* 0x003fd440fe0407f1 */ + /*1728*/ FFMA R24, R66, R79.reuse, R24; /* 0x59800c0004f74218 */ + /*1730*/ { FFMA R26, R64.reuse, R79, R26; /* 0x59800d0004f7401a */ + /*1738*/ BAR.SYNC 0x0; } /* 0xf0a81b8000070000 */ + /* 0x001fc400fe2007f1 */ + /*1748*/ @P0 LOP.XOR R114, R114, 0x1000; /* 0x3847040100007272 */ + /*1750*/ @P0 LOP.XOR R115, R115, 0x1000; /* 0x3847040100007373 */ + /*1758*/ @P0 LOP.XOR R118, R118, 0x1000; /* 
0x3847040100007676 */ + /* 0x001fc4c0fe0007f1 */ + /*1768*/ FFMA R27, R64, R78, R27; /* 0x59800d8004e7401b */ + /*1770*/ { FFMA R1, R82.reuse, R88.reuse, R1; /* 0x5980008005875201 */ + /*1778*/ @P0 LDS.U.128 R64, [R114]; } /* 0xef4e100000007240 */ + /* 0x001fc4c0fe0407f1 */ + /*1788*/ FFMA R0, R82, R89.reuse, R0; /* 0x5980000005975200 */ + /*1790*/ { FFMA R2, R80.reuse, R89.reuse, R2; /* 0x5980010005975002 */ + /*1798*/ @P0 LDS.U.128 R72, [R115]; } /* 0xef4e100000007348 */ + /* 0x001fc4c0fe0407f1 */ + /*17a8*/ FFMA R3, R80, R88.reuse, R3; /* 0x5980018005875003 */ + /*17b0*/ { FFMA R5, R83.reuse, R88.reuse, R5; /* 0x5980028005875305 */ + /*17b8*/ @P0 LDS.U.128 R68, [R114+0x80]; } /* 0xef4e100008007244 */ + /* 0x001c44c0fe0407f1 */ + /*17c8*/ FFMA R4, R83, R89.reuse, R4; /* 0x5980020005975304 */ + /*17d0*/ { FFMA R6, R81.reuse, R89.reuse, R6; /* 0x5980030005975106 */ + /*17d8*/ @P0 LDS.U.128 R76, [R115+0x80]; } /* 0xef4e10000800734c */ + /* 0x101fc4c0fe2407f1 */ + /*17e8*/ FFMA R7, R81, R88.reuse, R7; /* 0x5980038005875107 */ + /*17f0*/ FFMA R33, R86.reuse, R88.reuse, R33; /* 0x5980108005875621 */ + /*17f8*/ FFMA R32, R86, R89.reuse, R32; /* 0x5980100005975620 */ + /* 0x181fc480fe2607f1 */ + /*1808*/ FFMA R34, R84.reuse, R89.reuse, R34; /* 0x5980110005975422 */ + /*1810*/ FFMA R35, R84, R88.reuse, R35; /* 0x5980118005875423 */ + /*1818*/ FFMA R37, R87.reuse, R88.reuse, R37; /* 0x5980128005875725 */ + /* 0x081fc440fe2607f1 */ + /*1828*/ FFMA R36, R87.reuse, R89.reuse, R36; /* 0x5980120005975724 */ + /*1830*/ FFMA R38, R85.reuse, R89, R38; /* 0x5980130005975526 */ + /*1838*/ FFMA R39, R85.reuse, R88, R39; /* 0x5980138005875527 */ + /* 0x181fc480fe2607f1 */ + /*1848*/ FFMA R45, R87.reuse, R90.reuse, R45; /* 0x5980168005a7572d */ + /*1850*/ FFMA R44, R87, R91.reuse, R44; /* 0x5980160005b7572c */ + /*1858*/ FFMA R46, R85.reuse, R91.reuse, R46; /* 0x5980170005b7552e */ + /* 0x101fc4c0fe2407f1 */ + /*1868*/ FFMA R47, R85, R90.reuse, R47; /* 0x5980178005a7552f */ + /*1870*/ 
FFMA R41, R86.reuse, R90.reuse, R41; /* 0x5980148005a75629 */ + /*1878*/ FFMA R40, R86, R91.reuse, R40; /* 0x5980140005b75628 */ + /* 0x181fc480fe2607f1 */ + /*1888*/ FFMA R42, R84.reuse, R91.reuse, R42; /* 0x5980150005b7542a */ + /*1890*/ FFMA R43, R84, R90.reuse, R43; /* 0x5980158005a7542b */ + /*1898*/ FFMA R13, R83.reuse, R90.reuse, R13; /* 0x5980068005a7530d */ + /* 0x101fc4c0fe2407f1 */ + /*18a8*/ FFMA R12, R83, R91.reuse, R12; /* 0x5980060005b7530c */ + /*18b0*/ FFMA R14, R81.reuse, R91.reuse, R14; /* 0x5980070005b7510e */ + /*18b8*/ FFMA R15, R81, R90.reuse, R15; /* 0x5980078005a7510f */ + /* 0x081fc4c0fe2607f1 */ + /*18c8*/ FFMA R9, R82.reuse, R90.reuse, R9; /* 0x5980048005a75209 */ + /*18d0*/ FFMA R8, R82.reuse, R91.reuse, R8; /* 0x5980040005b75208 */ + /*18d8*/ FFMA R10, R80.reuse, R91, R10; /* 0x5980050005b7500a */ + /* 0x101fc4c0fc2207f1 */ + /*18e8*/ FFMA R11, R80.reuse, R90, R11; /* 0x5980058005a7500b */ + /*18f0*/ FFMA R17, R82.reuse, R92.reuse, R17; /* 0x5980088005c75211 */ + /*18f8*/ FFMA R16, R82, R93.reuse, R16; /* 0x5980080005d75210 */ + /* 0x181fc480fe2607f1 */ + /*1908*/ FFMA R18, R80.reuse, R93.reuse, R18; /* 0x5980090005d75012 */ + /*1910*/ FFMA R19, R80, R92.reuse, R19; /* 0x5980098005c75013 */ + /*1918*/ FFMA R21, R83.reuse, R92.reuse, R21; /* 0x59800a8005c75315 */ + /* 0x101fc4c0fe2407f1 */ + /*1928*/ FFMA R20, R83, R93.reuse, R20; /* 0x59800a0005d75314 */ + /*1930*/ FFMA R22, R81.reuse, R93.reuse, R22; /* 0x59800b0005d75116 */ + /*1938*/ FFMA R23, R81, R92.reuse, R23; /* 0x59800b8005c75117 */ + /* 0x181fc480fe2607f1 */ + /*1948*/ FFMA R49, R86.reuse, R92.reuse, R49; /* 0x5980188005c75631 */ + /*1950*/ FFMA R48, R86, R93.reuse, R48; /* 0x5980180005d75630 */ + /*1958*/ FFMA R50, R84.reuse, R93.reuse, R50; /* 0x5980190005d75432 */ + /* 0x181fc4c0fe2407f1 */ + /*1968*/ FFMA R51, R84, R92.reuse, R51; /* 0x5980198005c75433 */ + /*1970*/ FFMA R53, R87.reuse, R92.reuse, R53; /* 0x59801a8005c75735 */ + /*1978*/ FFMA R52, R87.reuse, R93.reuse, 
R52; /* 0x59801a0005d75734 */ + /* 0x181fc440fe2207f1 */ + /*1988*/ FFMA R54, R85.reuse, R93, R54; /* 0x59801b0005d75536 */ + /*1990*/ FFMA R55, R85.reuse, R92, R55; /* 0x59801b8005c75537 */ + /*1998*/ FFMA R61, R87.reuse, R94.reuse, R61; /* 0x59801e8005e7573d */ + /* 0x101fc4c0fe2407f1 */ + /*19a8*/ FFMA R60, R87, R95.reuse, R60; /* 0x59801e0005f7573c */ + /*19b0*/ FFMA R62, R85.reuse, R95.reuse, R62; /* 0x59801f0005f7553e */ + /*19b8*/ FFMA R63, R85, R94.reuse, R63; /* 0x59801f8005e7553f */ + /* 0x181fc480fe2607f1 */ + /*19c8*/ FFMA R57, R86.reuse, R94.reuse, R57; /* 0x59801c8005e75639 */ + /*19d0*/ FFMA R56, R86, R95.reuse, R56; /* 0x59801c0005f75638 */ + /*19d8*/ FFMA R58, R84.reuse, R95.reuse, R58; /* 0x59801d0005f7543a */ + /* 0x101fc4c0fe2407f1 */ + /*19e8*/ FFMA R59, R84, R94.reuse, R59; /* 0x59801d8005e7543b */ + /*19f0*/ FFMA R29, R83.reuse, R94.reuse, R29; /* 0x59800e8005e7531d */ + /*19f8*/ FFMA R28, R83, R95.reuse, R28; /* 0x59800e0005f7531c */ + /* 0x181fc480fe2607f1 */ + /*1a08*/ FFMA R30, R81.reuse, R95.reuse, R30; /* 0x59800f0005f7511e */ + /*1a10*/ FFMA R31, R81, R94.reuse, R31; /* 0x59800f8005e7511f */ + /*1a18*/ FFMA R25, R82.reuse, R94.reuse, R25; /* 0x59800c8005e75219 */ + /* 0x001fc440fe2407f1 */ + /*1a28*/ FFMA R24, R82, R95.reuse, R24; /* 0x59800c0005f75218 */ + /*1a30*/ FFMA R26, R80.reuse, R95, R26; /* 0x59800d0005f7501a */ + /*1a38*/ FFMA R27, R80, R94, R27; /* 0x59800d8005e7501b */ + /* 0x101fc480fe2407f1 */ + /*1a48*/ @P0 IADD R112, R112, R121.reuse; /* 0x5c10000007907070 */ + /*1a50*/ @P0 IADD R116, R116, R121.reuse; /* 0x5c10000007907474 */ + /*1a58*/ @P0 IADD R120, R120, R121.reuse; /* 0x5c10000007907878 */ + /* 0x081fc400fca007f0 */ + /*1a68*/ { @P0 IADD R124, R124, R121; /* 0x5c10000007907c7c */ + /*1a70*/ @P0 BRA 0x310; } /* 0xe2400ffe8980000f */ + /*1a78*/ SHR.U32 R80, R123.reuse, 0x1; /* 0x3828000000177b50 */ + /* 0x001fc480fe2007f1 */ + /*1a88*/ MOV R81, c[0x0][0x158]; /* 0x4c98078005670051 */ + /*1a90*/ ISCADD R84, R125, 
R126.reuse, 0x6; /* 0x5c18030007e77d54 */ + /*1a98*/ MOV R72, c[0x0][0x15c]; /* 0x4c98078005770048 */ + /* 0x001fc400fe2007f1 */ + /*1aa8*/ ISCADD R92, R123, R126, 0x3; /* 0x5c18018007e77b5c */ + /*1ab0*/ LOP.AND R114, R114, 0x7ff; /* 0x384700007ff77272 */ + /*1ab8*/ ISCADD R80, R122, R80, 0x6; /* 0x5c18030005077a50 */ + /* 0x001fc440fe2007f1 */ + /*1ac8*/ LOP.AND R115, R115, 0x7ff; /* 0x384700007ff77373 */ + /*1ad0*/ SHL R77, R81.reuse, 0x2; /* 0x384800000027514d */ + /*1ad8*/ ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; /* 0x4b6303800517542f */ + /* 0x081fc400fe2207f1 */ + /*1ae8*/ SHL R89, R81.reuse, 0x4; /* 0x3848000000475159 */ + /*1af0*/ FMUL R64, R3, R72; /* 0x5c68000004870340 */ + /*1af8*/ SHL R91, R81.reuse, 0x5; /* 0x384800000057515b */ + /* 0x001fc400fe2607f1 */ + /*1b08*/ XMAD.MRG R74, R80.reuse, R81.H1.reuse, RZ; /* 0x5b007fa80517504a */ + /*1b10*/ ISCADD R93, R115, R114, 0x4; /* 0x5c1802000727735d */ + /*1b18*/ XMAD R73, R80, R81, R84; /* 0x5b002a0005175049 */ + /* 0x001fc400fe2007f1 */ + /*1b28*/ SHL R92, R92, 0x2; /* 0x3848000000275c5c */ + /*1b30*/ IADD R84, R84, 0x20; /* 0x3810000002075454 */ + /*1b38*/ ISCADD R85, R81, -R89, 0x7; /* 0x5c19038005975155 */ + /* 0x001fc480fe2407f1 */ + /*1b48*/ FMUL R65, R7, R72.reuse; /* 0x5c68000004870741 */ + /*1b50*/ FMUL R66, R1, R72.reuse; /* 0x5c68000004870142 */ + /*1b58*/ XMAD.PSL.CBCC R73, R80.H1, R74.H1, R73; /* 0x5b30249804a75049 */ + /* 0x101fc400fe2007f1 */ + /*1b68*/ IADD R80, R80, -0x1; /* 0x3910007ffff75050 */ + /*1b70*/ ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT; /* 0x4b63038005175437 */ + /*1b78*/ FMUL R67, R5, R72.reuse; /* 0x5c68000004870543 */ + /* 0x001fc480fe2407f1 */ + /*1b88*/ FMUL R68, R35, R72.reuse; /* 0x5c68000004872344 */ + /*1b90*/ FMUL R69, R39, R72.reuse; /* 0x5c68000004872745 */ + /*1b98*/ ISCADD R76, R73, c[0x0][0x140], 0x2; /* 0x4c1801000507494c */ + /* 0x001fc440fe2207f1 */ + /*1ba8*/ IADD R86, R80.reuse, 0x4; /* 0x3810000000475056 */ + /*1bb0*/ IADD R87, R80.reuse, 0x8; /* 
0x3810000000875057 */ + /*1bb8*/ IADD R88, R80, 0xc; /* 0x3810000000c75058 */ + /* 0x001f9800fe2407f1 */ + /*1bc8*/ FMUL R70, R33, R72.reuse; /* 0x5c68000004872146 */ + /*1bd0*/ FMUL R71, R37, R72; /* 0x5c68000004872547 */ + /*1bd8*/ IADD R76, R76, -R77; /* 0x5c11000004d74c4c */ + /* 0x001fc080fca207f1 */ + /*1be8*/ IADD R75, R76.reuse, R89; /* 0x5c10000005974c4b */ + /*1bf0*/ IADD R78, R76, R91.reuse; /* 0x5c10000005b74c4e */ + /*1bf8*/ { IADD R79, R75, R91; /* 0x5c10000005b74b4f */ + /*1c08*/ CAL 0x1f10; } /* 0x101fc482fe2007f5 */ + /* 0xe260000030000040 */ + /*1c10*/ FMUL R64, R2, R72.reuse; /* 0x5c68000004870240 */ + /*1c18*/ FMUL R65, R6, R72.reuse; /* 0x5c68000004870641 */ + /* 0x101fc480fe2407f1 */ + /*1c28*/ FMUL R66, R0, R72.reuse; /* 0x5c68000004870042 */ + /*1c30*/ FMUL R67, R4, R72.reuse; /* 0x5c68000004870443 */ + /*1c38*/ FMUL R68, R34, R72.reuse; /* 0x5c68000004872244 */ + /* 0x001fc080fe2407f1 */ + /*1c48*/ FMUL R69, R38, R72.reuse; /* 0x5c68000004872645 */ + /*1c50*/ FMUL R70, R32, R72.reuse; /* 0x5c68000004872046 */ + /*1c58*/ { FMUL R71, R36, R72; /* 0x5c68000004872447 */ + /*1c68*/ CAL 0x1f10; } /* 0x101fc482fe2007f5 */ + /* 0xe26000002a000040 */ + /*1c70*/ FMUL R64, R11, R72.reuse; /* 0x5c68000004870b40 */ + /*1c78*/ FMUL R65, R15, R72.reuse; /* 0x5c68000004870f41 */ + /* 0x101fc480fe2407f1 */ + /*1c88*/ FMUL R66, R9, R72.reuse; /* 0x5c68000004870942 */ + /*1c90*/ FMUL R67, R13, R72.reuse; /* 0x5c68000004870d43 */ + /*1c98*/ FMUL R68, R43, R72.reuse; /* 0x5c68000004872b44 */ + /* 0x001fc080fe2407f1 */ + /*1ca8*/ FMUL R69, R47, R72.reuse; /* 0x5c68000004872f45 */ + /*1cb0*/ FMUL R70, R41, R72.reuse; /* 0x5c68000004872946 */ + /*1cb8*/ { FMUL R71, R45, R72; /* 0x5c68000004872d47 */ + /*1cc8*/ CAL 0x1f10; } /* 0x101fc482fe2007f5 */ + /* 0xe260000024000040 */ + /*1cd0*/ FMUL R64, R10, R72.reuse; /* 0x5c68000004870a40 */ + /*1cd8*/ FMUL R65, R14, R72.reuse; /* 0x5c68000004870e41 */ + /* 0x101fc480fe2407f1 */ + /*1ce8*/ FMUL R66, R8, R72.reuse; /* 
0x5c68000004870842 */ + /*1cf0*/ FMUL R67, R12, R72.reuse; /* 0x5c68000004870c43 */ + /*1cf8*/ FMUL R68, R42, R72.reuse; /* 0x5c68000004872a44 */ + /* 0x001fc080fe2407f1 */ + /*1d08*/ FMUL R69, R46, R72.reuse; /* 0x5c68000004872e45 */ + /*1d10*/ FMUL R70, R40, R72.reuse; /* 0x5c68000004872846 */ + /*1d18*/ { FMUL R71, R44, R72; /* 0x5c68000004872c47 */ + /*1d28*/ CAL 0x1f10; } /* 0x001fc400fe2007f5 */ + /* 0xe26000001e000040 */ + /*1d30*/ IADD R80, R80, 0x1c; /* 0x3810000001c75050 */ + /*1d38*/ IADD R86, R86, 0x1c; /* 0x3810000001c75656 */ + /* 0x105fc400fe2007f1 */ + /*1d48*/ IADD R87, R87, 0x1c; /* 0x3810000001c75757 */ + /*1d50*/ IADD R88, R88, 0x1c; /* 0x3810000001c75858 */ + /*1d58*/ IADD R76, R76, R85.reuse; /* 0x5c10000005574c4c */ + /* 0x001fc480fe2407f1 */ + /*1d68*/ IADD R75, R75, R85.reuse; /* 0x5c10000005574b4b */ + /*1d70*/ IADD R78, R78, R85.reuse; /* 0x5c10000005574e4e */ + /*1d78*/ IADD R79, R79, R85; /* 0x5c10000005574f4f */ + /* 0x101fc480fe2407f1 */ + /*1d88*/ FMUL R64, R19, R72.reuse; /* 0x5c68000004871340 */ + /*1d90*/ FMUL R65, R23, R72.reuse; /* 0x5c68000004871741 */ + /*1d98*/ FMUL R66, R17, R72.reuse; /* 0x5c68000004871142 */ + /* 0x101fc480fe2407f1 */ + /*1da8*/ FMUL R67, R21, R72.reuse; /* 0x5c68000004871543 */ + /*1db0*/ FMUL R68, R51, R72.reuse; /* 0x5c68000004873344 */ + /*1db8*/ FMUL R69, R55, R72.reuse; /* 0x5c68000004873745 */ + /* 0x001fd400fe0407f1 */ + /*1dc8*/ FMUL R70, R49, R72.reuse; /* 0x5c68000004873146 */ + /*1dd0*/ { FMUL R71, R53, R72; /* 0x5c68000004873547 */ + /*1dd8*/ CAL 0x1f10; } /* 0xe260000013000040 */ + /* 0x101fc480fe2417f1 */ + /*1de8*/ FMUL R64, R18, R72.reuse; /* 0x5c68000004871240 */ + /*1df0*/ FMUL R65, R22, R72.reuse; /* 0x5c68000004871641 */ + /*1df8*/ FMUL R66, R16, R72.reuse; /* 0x5c68000004871042 */ + /* 0x101fc480fe2407f1 */ + /*1e08*/ FMUL R67, R20, R72.reuse; /* 0x5c68000004871443 */ + /*1e10*/ FMUL R68, R50, R72.reuse; /* 0x5c68000004873244 */ + /*1e18*/ FMUL R69, R54, R72.reuse; /* 
0x5c68000004873645 */ + /* 0x001fd400fe0407f1 */ + /*1e28*/ FMUL R70, R48, R72.reuse; /* 0x5c68000004873046 */ + /*1e30*/ { FMUL R71, R52, R72; /* 0x5c68000004873447 */ + /*1e38*/ CAL 0x1f10; } /* 0xe26000000d000040 */ + /* 0x101fc480fe2417f1 */ + /*1e48*/ FMUL R64, R27, R72.reuse; /* 0x5c68000004871b40 */ + /*1e50*/ FMUL R65, R31, R72.reuse; /* 0x5c68000004871f41 */ + /*1e58*/ FMUL R66, R25, R72.reuse; /* 0x5c68000004871942 */ + /* 0x101fc480fe2407f1 */ + /*1e68*/ FMUL R67, R29, R72.reuse; /* 0x5c68000004871d43 */ + /*1e70*/ FMUL R68, R59, R72.reuse; /* 0x5c68000004873b44 */ + /*1e78*/ FMUL R69, R63, R72.reuse; /* 0x5c68000004873f45 */ + /* 0x001fd400fe0407f1 */ + /*1e88*/ FMUL R70, R57, R72.reuse; /* 0x5c68000004873946 */ + /*1e90*/ { FMUL R71, R61, R72; /* 0x5c68000004873d47 */ + /*1e98*/ CAL 0x1f10; } /* 0xe260000007000040 */ + /* 0x101fc480fe2417f1 */ + /*1ea8*/ FMUL R64, R26, R72.reuse; /* 0x5c68000004871a40 */ + /*1eb0*/ FMUL R65, R30, R72.reuse; /* 0x5c68000004871e41 */ + /*1eb8*/ FMUL R66, R24, R72.reuse; /* 0x5c68000004871842 */ + /* 0x101fc480fe2407f1 */ + /*1ec8*/ FMUL R67, R28, R72.reuse; /* 0x5c68000004871c43 */ + /*1ed0*/ FMUL R68, R58, R72.reuse; /* 0x5c68000004873a44 */ + /*1ed8*/ FMUL R69, R62, R72.reuse; /* 0x5c68000004873e45 */ + /* 0x001fd400fe0407f1 */ + /*1ee8*/ FMUL R70, R56, R72.reuse; /* 0x5c68000004873846 */ + /*1ef0*/ { FMUL R71, R60, R72; /* 0x5c68000004873c47 */ + /*1ef8*/ CAL 0x1f10; } /* 0xe260000001000040 */ + /* 0x001fc400fe0007f5 */ + /*1f08*/ EXIT; /* 0xe30000000007000f */ + /*1f10*/ { IADD R80, R80, 0x1; /* 0x3810000000175050 */ + /*1f18*/ STS.128 [R93], R64; } /* 0xef5e000000075d40 */ + /* 0x001fc000fe2007f0 */ + /*1f28*/ { IADD R86, R86, 0x1; /* 0x3810000000175656 */ + /*1f30*/ STS.128 [R93+0x80], R68; } /* 0xef5e000008075d44 */ + /*1f38*/ { IADD R87, R87, 0x1; /* 0x3810000000175757 */ + /*1f48*/ LDS R64, [R92]; } /* 0x001fc400fe0007f1 */ + /* 0xef4c000000075c40 */ + /*1f50*/ { IADD R88, R88, 0x1; /* 0x3810000000175858 */ + 
/*1f58*/ LDS R65, [R92+0x80]; } /* 0xef4c000008075c41 */ + /* 0x101fc000fe2407f0 */ + /*1f68*/ { IADD R76, R76, R77.reuse; /* 0x5c10000004d74c4c */ + /*1f70*/ LDS R66, [R92+0x100]; } /* 0xef4c000010075c42 */ + /*1f78*/ { IADD R75, R75, R77.reuse; /* 0x5c10000004d74b4b */ + /*1f88*/ LDS R67, [R92+0x180]; } /* 0x001fc480fe0007f1 */ + /* 0xef4c000018075c43 */ + /*1f90*/ { IADD R78, R78, R77.reuse; /* 0x5c10000004d74e4e */ + /*1f98*/ LDS R68, [R92+0x200]; } /* 0xef4c000020075c44 */ + /* 0x081fc000fe2007f0 */ + /*1fa8*/ { IADD R79, R79, R77; /* 0x5c10000004d74f4f */ + /*1fb0*/ LDS R69, [R92+0x280]; } /* 0xef4c000028075c45 */ + /*1fb8*/ { ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; /* 0x4b63028005275007 */ + /*1fc8*/ LDS R70, [R92+0x300]; } /* 0x001c4400fe0007f1 */ + /* 0xef4c000030075c46 */ + /*1fd0*/ { ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6; /* 0x4b6303000527500f */ + /*1fd8*/ LDS R71, [R92+0x380]; } /* 0xef4c000038075c47 */ + /* 0x003fc400fd2207f2 */ + /*1fe8*/ ISETP.LT.AND P2, PT, R86.reuse, c[0x0][0x148], P5; /* 0x4b63028005275617 */ + /*1ff0*/ ISETP.LT.AND P3, PT, R86, c[0x0][0x148], P6; /* 0x4b6303000527561f */ + /*1ff8*/ @P0 STG.CG [R76], R64; /* 0xeedc400000004c40 */ + /* 0x001fc000fe2207f0 */ + /*2008*/ { ISETP.LT.AND P0, PT, R87.reuse, c[0x0][0x148], P5; /* 0x4b63028005275707 */ + /*2010*/ @P1 STG.CG [R76+0x80], R65; } /* 0xeedc400008014c41 */ + /*2018*/ { ISETP.LT.AND P1, PT, R87, c[0x0][0x148], P6; /* 0x4b6303000527570f */ + /*2028*/ @P2 STG.CG [R75], R66; } /* 0x001fc440fe2007f1 */ + /* 0xeedc400000024b42 */ + /*2030*/ ISETP.LT.AND P2, PT, R88.reuse, c[0x0][0x148], P5; /* 0x4b63028005275817 */ + /*2038*/ @P3 STG.CG [R75+0x80], R67; /* 0xeedc400008034b43 */ + /* 0x001fc400fe2007e9 */ + /*2048*/ ISETP.LT.AND P3, PT, R88, c[0x0][0x148], P6; /* 0x4b6303000527581f */ + /*2050*/ @P0 STG.CG [R78], R68; /* 0xeedc400000004e44 */ + /*2058*/ @P1 STG.CG [R78+0x80], R69; /* 0xeedc400008014e45 */ + /* 0x001fd4003e2007f2 */ + /*2068*/ @P2 STG.CG [R79], R70; /* 
0xeedc400000024f46 */ + /*2070*/ @P3 STG.CG [R79+0x80], R71; /* 0xeedc400008034f47 */ + /*2078*/ RET; /* 0xe32000000007000f */ + /* 0x001f8000fc0007ff */ + /*2088*/ BRA 0x2088; /* 0xe2400fffff87000f */ + /*2090*/ NOP; /* 0x50b0000000070f00 */ + /*2098*/ NOP; /* 0x50b0000000070f00 */ + /* 0x001f8000fc0007e0 */ + /*20a8*/ NOP; /* 0x50b0000000070f00 */ + /*20b0*/ NOP; /* 0x50b0000000070f00 */ + /*20b8*/ NOP; /* 0x50b0000000070f00 */ + ................................ + + diff --git a/Assembler/PascalAs/t/MaxAs-MaxAs.t b/Assembler/PascalAs/t/MaxAs-MaxAs.t new file mode 100644 index 0000000..ad9e988 --- /dev/null +++ b/Assembler/PascalAs/t/MaxAs-MaxAs.t @@ -0,0 +1,5 @@ +use strict; +use warnings; + +use Test::More tests => 1; +BEGIN { use_ok('MaxAs::MaxAs') }; diff --git a/Kernel/Convolution/Kepler/Makefile b/Kernel/Convolution/Kepler/Makefile new file mode 100644 index 0000000..8f5ee71 --- /dev/null +++ b/Kernel/Convolution/Kepler/Makefile @@ -0,0 +1,28 @@ +BINS := sconv_fprop_K64_N64 sconv_bprop_C64_N64 sconv_update_C128_K128 \ + sconv_bprop_C1_N64 sconv_fprop_K128_N128 sconv_bprop_C128_N128 +TARGETS := $(addsuffix .cubin, $(BINS)) +TEMPLATES := $(addsuffix _template.cubin, $(BINS)) + +all: $(BINS) sconv_fprop sconv_bprop sconv_update + +$(BINS): + nvcc -arch sm_35 -m 64 $@.cu -cubin -O3 -o $@_template.cubin + KeplerAs.pl -i $@.sass $@_template.cubin $@.cubin + +sconv_fprop: sconv_fprop.cu + nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart + +sconv_bprop: sconv_bprop.cu + nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart + +sconv_update: sconv_update.cu + nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart + +clean: + rm $(TARGETS) $(TEMPLATES) sconv_fprop sconv_bprop sconv_update + +.PHONY: + all clean + +#utils +print-% : ; $(info $* is $(flavor $*) variable set to [$($*)]) @true diff --git a/Kernel/Convolution/Kepler/sconv.h b/Kernel/Convolution/Kepler/sconv.h new file mode 100644 index 0000000..f98ffad --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv.h @@ -0,0 +1,96 @@ +#include 
+#include +#include +#include +#include +#include +#include +#include + +std::map nervana_kernels; +std::vector nervana_modules; + +int len_d2b(int n) { + int i, j = 0; + i = n; + while (i) { + i /= 2; + j++; + } + return j; +} + +void magic32(unsigned int nmax, unsigned int d, unsigned int& m, unsigned int& p) { + long nc = ((nmax + 1) / d) * d - 1; + long nbits = len_d2b(nmax); + std::cout << "nbits " << nbits << std::endl; + for(p = 0; p < 2 * nbits + 1; p++) { + if(pow(2, p) > nc * (d - 1 - (long)(pow(2, p) - 1) % d)) { + m = (pow(2, p) + d - 1 -(long)(pow(2, p) - 1) % d) / d; + std::cout << "m " << m << std::endl; + std::cout << "p " << p << std::endl; + return; + } + } + return; +} + +void magic64(unsigned int d, unsigned int& magic, unsigned int& shift) { + // 3 is a special case that only ends up in the high bits + // if the nmax is 0xffffffff + // we can't use 0xffffffff for all cases as some return a 33 bit + // magic number + unsigned long nmax; + if(d == 3) + nmax = 0xffffffff; + else + nmax = 0x7fffffff; + magic32(nmax, d, magic, shift); + if(magic != 1) + shift -= 32; +} + +bool load_kernels(const char* const base_path_cstr) { + //better would be a vector, but there is a bug in nvcc that prevents this + // (bug report filed) + const int NUM_KERNELS = 6; + std::string names[NUM_KERNELS] = { + "sconv_fprop_K64_N64", + "sconv_fprop_K128_N128", + "sconv_bprop_C128_N128", + "sconv_bprop_C64_N64", + "sconv_bprop_C1_N64", + "sconv_update_C128_K128" + }; + + std::string base_path(base_path_cstr); + + for (int i = 0; i < NUM_KERNELS; ++i) { + std::string kernel = names[i]; + if (nervana_kernels.count(kernel) > 0) + continue; + + CUmodule module; + + std::string path = base_path + kernel + std::string(".cubin"); + CUresult res = cuModuleLoad(&module, path.c_str()); + + if (res != CUDA_SUCCESS) { + std::cerr << "Failed to load: " << kernel << " " << res << std::endl; + return false; + } + + nervana_modules.push_back(module); + + CUfunction function; + res = 
cuModuleGetFunction(&function, module, kernel.c_str()); + if (res != CUDA_SUCCESS) { + std::cerr << "Failed to extract: " << kernel << " " << res << std::endl; + return false; + } + + nervana_kernels.insert(std::make_pair(kernel, function)); + } + + return true; +} diff --git a/Kernel/Convolution/Kepler/sconv_bprop.cu b/Kernel/Convolution/Kepler/sconv_bprop.cu new file mode 100644 index 0000000..de2c980 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_bprop.cu @@ -0,0 +1,362 @@ +#include "sconv.h" + +bool bprop_C128_N128(float *I, const float *F, const float *O, + unsigned int N, unsigned int C, unsigned int K, + unsigned int D, unsigned int H, unsigned int W, + unsigned int R, unsigned int S, unsigned int T, + unsigned int M, unsigned int P, unsigned int Q, + unsigned int str_d, unsigned int str_h, unsigned int str_w, + unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) { + float alpha = 1.0f; + unsigned int DHW, WN, HW, HWN, DHWN, CRST, RST, RS; + unsigned int MPQ, PQ, QN, PQN, MPQN; + unsigned int magic_HW, magic_W; + unsigned int shift_HW, shift_W; + unsigned int magic_RST, magic_RS, magic_S; + unsigned int shift_RST, shift_RS, shift_S; + unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ; + unsigned int magic_str_w, magic_str_h, magic_str_d; + unsigned int shift_str_w, shift_str_h, shift_str_d; + // input + WN = W * N; + HW = H * W; + HWN = H * WN; + DHW = D * HW; + DHWN = D * HWN; + // filter + RS = R * S; + RST = T * RS; + CRST = C * RS; + // output + QN = Q * N; + PQN = P * QN; + MPQN = M * PQN; + PQ = P * Q; + MPQ = M * P * Q; + // magic numbers + magic32(MPQ, PQ, magic_PQ, shift_PQ); + magic32(PQ, Q, magic_Q, shift_Q); + magic32(CRST, RST, magic_RST, shift_RST); + magic32(RST + 32, RS, magic_RS, shift_RS); + magic32(RS + 32, S, magic_S, shift_S); + magic32(W + S - pad_w - 2, str_w, magic_str_w, shift_str_w); + magic32(H + R - pad_h - 2, str_h, magic_str_h, shift_str_h); + magic32(D + T - pad_d - 2, str_d, magic_str_d, shift_str_d); + 
magic32(DHW, HW, magic_HW, shift_HW); + magic32(HW, W, magic_W, shift_W); + // test param set up + float *test_param; + cudaError_t cuda_error; + cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024); + cudaMemset(test_param, 0, sizeof(float) * 1024); + void *args[45] = { + &test_param, &I, &O, &F, &alpha, + &N, &C, &M, &P, &Q, &QN, &PQN, &MPQN, + &K, &CRST, &RST, + &RS, &magic_RS, &shift_RS, + &S, &magic_S, &shift_S, + &pad_d, &pad_h, &pad_w, + &str_d, &str_h, &str_w, + &W, &HW, &WN, &HWN, &DHWN, + &magic_W, &shift_W, + &magic_HW, &shift_HW, + &R, &T, + &magic_str_w, &shift_str_w, + &magic_str_h, &shift_str_h, + &magic_str_d, &shift_str_d}; + int gridDWH = D * W * H; + int gridX = gridDWH; + int gridY = C / 128 + (C % 128 != 0); + int gridZ = N / 128 + (N % 128 != 0); + std::string kernel_name = "sconv_bprop_C128_N128"; + CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 256, 1, 1, + 128 * 8 * 4 + RST * 4 * 2 + 8, 0, args, NULL); + if (res != CUDA_SUCCESS) { + std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl; + return false; + } + cuCtxSynchronize(); + float* h_test = (float *)malloc(sizeof(float) * 128); + for (int i = 0; i < 128; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 128, cudaMemcpyDeviceToHost); + if (cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 128; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + // free test_param + free(h_test); + return true; +} + +bool bprop_C64_N64(float *I, const float *F, const float *O, + unsigned int N, unsigned int C, unsigned int K, + unsigned int D, unsigned int H, unsigned int W, + unsigned int R, unsigned int S, unsigned int T, + unsigned int M, unsigned int P, unsigned int Q, + unsigned int str_d, unsigned int 
str_h, unsigned int str_w, + unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) { + float alpha = 1.0f; + unsigned int DHW, WN, HW, HWN, DHWN, CRST, RST, RS; + unsigned int MPQ, PQ, QN, PQN, MPQN; + unsigned int magic_HW, magic_W; + unsigned int shift_HW, shift_W; + unsigned int magic_RST, magic_RS, magic_S; + unsigned int shift_RST, shift_RS, shift_S; + unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ; + unsigned int magic_str_w, magic_str_h, magic_str_d; + unsigned int shift_str_w, shift_str_h, shift_str_d; + // input + WN = W * N; + HW = H * W; + HWN = H * WN; + DHW = D * HW; + DHWN = D * HWN; + // filter + RS = R * S; + RST = T * RS; + CRST = C * RS; + // output + QN = Q * N; + PQN = P * QN; + MPQN = M * PQN; + PQ = P * Q; + MPQ = M * P * Q; + // magic numbers + magic32(MPQ, PQ, magic_PQ, shift_PQ); + magic32(PQ, Q, magic_Q, shift_Q); + magic32(CRST, RST, magic_RST, shift_RST); + magic32(RST + 32, RS, magic_RS, shift_RS); + magic32(RS + 32, S, magic_S, shift_S); + magic32(W + S - pad_w - 2, str_w, magic_str_w, shift_str_w); + magic32(H + R - pad_h - 2, str_h, magic_str_h, shift_str_h); + magic32(D + T - pad_d - 2, str_d, magic_str_d, shift_str_d); + magic32(DHW, HW, magic_HW, shift_HW); + magic32(HW, W, magic_W, shift_W); + // test param set up + float *test_param; + cudaError_t cuda_error; + cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024); + cudaMemset(test_param, 0, sizeof(float) * 1024); + void *args[45] = { + &test_param, &I, &O, &F, &alpha, + &N, &C, &M, &P, &Q, &QN, &PQN, &MPQN, + &K, &CRST, &RST, + &RS, &magic_RS, &shift_RS, + &S, &magic_S, &shift_S, + &pad_d, &pad_h, &pad_w, + &str_d, &str_h, &str_w, + &W, &HW, &WN, &HWN, &DHWN, + &magic_W, &shift_W, + &magic_HW, &shift_HW, + &R, &T, + &magic_str_w, &shift_str_w, + &magic_str_h, &shift_str_h, + &magic_str_d, &shift_str_d}; + int gridDWH = D * W * H; + int gridX = gridDWH; + int gridY = C / 64 + (C % 64 != 0); + int gridZ = N / 64 + (N % 64 != 0); + std::string kernel_name = 
"sconv_bprop_C64_N64"; + CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 64, 1, 1, + 0, 0, args, NULL); + if (res != CUDA_SUCCESS) { + std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl; + return false; + } + cuCtxSynchronize(); + float* h_test = (float *)malloc(sizeof(float) * 64); + for (int i = 0; i < 64; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 64, cudaMemcpyDeviceToHost); + if (cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 64; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + // free test_param + free(h_test); + return true; +} + +bool bprop_C1_N64(float *I, const float *F, const float *O, + unsigned int N, unsigned int C, unsigned int K, + unsigned int D, unsigned int H, unsigned int W, + unsigned int R, unsigned int S, unsigned int T, + unsigned int M, unsigned int P, unsigned int Q, + unsigned int str_d, unsigned int str_h, unsigned int str_w, + unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) { + float alpha = 1.0f; + unsigned int WN, HWN, DHWN, CRST, RST, RS; + unsigned int MPQ, PQ, QN, PQN, MPQN; + unsigned int magic_RST, magic_RS, magic_S; + unsigned int shift_RST, shift_RS, shift_S; + unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ; + unsigned int magic_str_w, magic_str_h, magic_str_d; + unsigned int shift_str_w, shift_str_h, shift_str_d; + unsigned int CRST32, MPQN32; + // input + WN = W * N; + HWN = H * WN; + DHWN = D * HWN; + // filter + RS = R * S; + RST = T * RS; + CRST = C * RS; + // output + QN = Q * N; + PQN = P * QN; + MPQN = M * PQN; + PQ = P * Q; + MPQ = M * PQ; + // special + CRST32 = 32 * CRST; + MPQN32 = 32 * MPQN; + // magic numbers + magic32(MPQ, PQ, magic_PQ, shift_PQ); + magic32(PQ, Q, magic_Q, shift_Q); + magic32(CRST, 
RST, magic_RST, shift_RST); + magic32(RST + 32, RS, magic_RS, shift_RS); + magic32(RS + 32, S, magic_S, shift_S); + magic32(W + S - pad_w - 2, str_w, magic_str_w, shift_str_w); + magic32(H + R - pad_h - 2, str_h, magic_str_h, shift_str_h); + magic32(D + T - pad_d - 2, str_d, magic_str_d, shift_str_d); + // test param set up + float *test_param; + cudaError_t cuda_error; + cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024); + cudaMemset(test_param, 0, sizeof(float) * 1024); + void *args[41] = { + &test_param, &I, &O, &F, &alpha, + &N, &K, &D, &H, &W, &WN, &HWN, &DHWN, + &C, &CRST, + &RST, &magic_RST, &shift_RST, + &RS, &magic_RS, &shift_RS, + &S, &magic_S, &shift_S, + &pad_d, &pad_h, &pad_w, + &str_d, &str_h, &str_w, + &Q, &PQ, &QN, &PQN, &MPQN, + &magic_Q, &shift_Q, + &magic_PQ, &shift_PQ, + &CRST32, + &MPQN32}; + int gridMPQ = MPQ; + int gridX = gridMPQ; + int gridY = CRST / 32 + (CRST % 32 != 0); + int gridZ = N / 64 + (N % 64 != 0); + const std::string kernel_name = "sconv_bprop_C1_N64"; + CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 32, 1, 1, + 0, 0, args, NULL); + if (res != CUDA_SUCCESS) { + std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl; + return false; + } + cuCtxSynchronize(); + float* h_test = (float *)malloc(sizeof(float) * 32); + cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 32, cudaMemcpyDeviceToHost); + if (cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 32; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + // free test_param + free(h_test); + return true; +} + +int main(int argc, char** argv) { + // init + cudaFree(0); + // params + float *d_I, *d_F, *d_O; + unsigned int N = 128, C = 192, K = 192, D = 1, H = 13, W = 13, T = 1, R = 12, S = 12; + unsigned int str_d = 1, str_h = 1, str_w = 1; + unsigned int 
pad_d = 0, pad_h = 0, pad_w = 0; + unsigned int M, P, Q; + cudaError_t cuda_error; + // 32, 64, or 128 + if (argc > 1) { + C = atoi(argv[1]); + } + M = (D - T + 2 * pad_d) / str_d + 1; + P = (H - R + 2 * pad_h) / str_h + 1; + Q = (W - S + 2 * pad_w) / str_w + 1; + float *h_O = (float *)malloc(K * M * P * Q * N * sizeof(float)); + for (int i = 0; i < K * M * P * Q * N; ++i) { + h_O[i] = 1; + } + float *h_F = (float *)malloc(K * R * S * T * C * sizeof(float)); + for (int i = 0; i < K * C * R * S * T; ++i) { + h_F[i] = 1; + } + float* h_I = (float *)malloc(sizeof(float) * C * D * H * W * N); + // device memory + cudaMalloc((void**)&d_I, sizeof(float) * C * D * H * W * N); + cudaMalloc((void**)&d_F, sizeof(float) * K * R * S * T * C * 2); + cudaMalloc((void**)&d_O, sizeof(float) * K * M * P * Q * N); + // memcpy h_O, h_F + cudaMemcpy(d_O, h_O, sizeof(float) * M * P * Q * K * N, + cudaMemcpyHostToDevice); + cudaMemcpy(d_F, h_F, sizeof(float) * K * R * S * T * C, + cudaMemcpyHostToDevice); + // load kernels + if (!load_kernels("./")) { + std::cerr << "Couldn't load all kernels" << std::endl; + exit(1); + } + if (C % 64 != 0) { + // launch kernel C1 + if (!bprop_C1_N64(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) { + std::cerr << "Launch error C1" << std::endl; + exit(1); + } + } else { + // launch kernel C64 + if (C <= 64) { + if (!bprop_C64_N64(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) { + std::cerr << "Launch error C64" << std::endl; + exit(1); + } + } else { + if (!bprop_C128_N128(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) { + std::cerr << "Launch error C128" << std::endl; + exit(1); + } + } + } + // output + std::cout << "result" << std::endl; + cuda_error = cudaMemcpy(h_I, d_I, sizeof(float) * C * D * H * W * N, cudaMemcpyDeviceToHost); + if (cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy 
error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 128; ++i) { + std::cout << h_I[i] << " "; + } + std::cout << std::endl; + // free memory + free(h_O); + free(h_I); + free(h_F); + cudaFree(d_I); + cudaFree(d_F); + cudaFree(d_O); + // run successfully + std::cout << "finish" << std::endl; + return 0; +} diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.cu b/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.cu new file mode 100644 index 0000000..dddde07 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.cu @@ -0,0 +1,56 @@ +extern "C" +__global__ void sconv_bprop_C128_N128 ( + float* param_test, + float* param_O, + const float* param_I, + const float* param_F, + float param_alpha, + int param_N, + int param_K, + int param_D, + int param_H, + int param_W, + int param_WN, + int param_HWN, + int param_DHWN, + int param_C, + int param_CRST, + int param_RST, + int param_RS, + int param_magic_RS, + int param_shift_RS, + int param_S, + int param_magic_S, + int param_shift_S, + int param_pad_d, + int param_pad_h, + int param_pad_w, + int param_str_d, + int param_str_h, + int param_str_w, + int param_Q, + int param_PQ, + int param_QN, + int param_PQN, + int param_MPQN, + int param_magic_Q, + int param_shift_Q, + int param_magic_PQ, + int param_shift_PQ, + int param_R, + int param_T, + int param_magic_str_w, + int param_shift_str_w, + int param_magic_str_h, + int param_shift_str_h, + int param_magic_str_d, + int param_shift_str_d) { + __shared__ float share[128 * 8 * 4 + 8]; + + int tid = threadIdx.x; + + share[tid] = 1; + + *param_O = share[127-tid]; + *param_test = share[127-tid]; +} diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.sass b/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.sass new file mode 100644 index 0000000..c7cb6e5 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.sass @@ -0,0 +1,785 @@ +# Kernel: sconv_bprop_C128_N128 +// debug: +// mode1 +//-:-:-:-:00 MOV tmp_param0, 
param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +//-:-:-:-:00 SHL tmp_shl, tid, 0x2; +//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0; +//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +//-:-:-:-:00 I2F.F32.U32 rst, rst; +//-:-:-:-:00 ST.E [tmp_param00], rst; +//-:-:-:-:00 EXIT; + +// mode2 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +// +//-:-:-:-:00 MOV32I k, 0x40000000; +//-:-:-:-:00 ST.E [tmp_param0], k; +//-:-:-:-:00 EXIT; + +// modify steps: +// XMAD->IMAD +// shared memory addresses->RZ +// LDG->LD +// LEA->MOV, IADD, SHL +// XMAD.LO2C->IMAD.U32.U32 +// XMAD.PSL->IMAD.U32.U32 +// VMAD->IMAD, IADD +// MOV->MOV32I +// IADD3->IADD, IADD +// POPC +// ST.CG->ST +// control code +// comments +// LDS.U->LDS +// register<0-7>->register<0-3>, register<4-7> +// avoid register conflicts +// tid->other register + +// optimization steps: +// alexnet2 +// initial->1400 +// bank conflict->1400 +// alignment+dual issue+reuse->2100 +// all ldg.128->1900 +// control codes->2000 +// reduce unnecessary instructions->2100 +// scheduling->1937 + + + szShareF : (128*8) + szShareI : (128*8) + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + addr_m : 4x<128*8*2 + 128*8*2 + 4> + addr_p : 4x<128*8*2 + 128*8*2 + 5> + addr_q : 4x<128*8*2 + 128*8*2 + 6> + addr_szLut : 4x<128*8*2 + 128*8*2 + 7> + addr_lut : 4x<128*8*2 + 128*8*2 + 8> + + param_test[0] : c[0x0][0x140] + param_test[1] : c[0x0][0x144] + param_O[0] : c[0x0][0x148] + param_O[1] : c[0x0][0x14c] + param_I[0] : c[0x0][0x150] + param_I[1] : c[0x0][0x154] + param_F[0] : c[0x0][0x158] + param_F[1] : c[0x0][0x15c] + param_alpha : c[0x0][0x160] + param_N : c[0x0][0x164] + param_K : c[0x0][0x168] + param_D : c[0x0][0x16c] + param_H : c[0x0][0x170] + param_W : c[0x0][0x174] + param_WN : c[0x0][0x178] + param_HWN : c[0x0][0x17c] + param_DHWN : c[0x0][0x180] + param_C : c[0x0][0x184] + param_KRST : c[0x0][0x188] + param_RST : c[0x0][0x18c] + param_RS : c[0x0][0x190] + param_magic_RS 
: c[0x0][0x194] + param_shift_RS : c[0x0][0x198] + param_S : c[0x0][0x19c] + param_magic_S : c[0x0][0x1a0] + param_shift_S : c[0x0][0x1a4] + param_pad_d : c[0x0][0x1a8] + param_pad_h : c[0x0][0x1ac] + param_pad_w : c[0x0][0x1b0] + param_str_d : c[0x0][0x1b4] + param_str_h : c[0x0][0x1b8] + param_str_w : c[0x0][0x1bc] + param_Q : c[0x0][0x1c0] + param_PQ : c[0x0][0x1c4] + param_QN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_MPQN : c[0x0][0x1d0] + param_magic_Q : c[0x0][0x1d4] + param_shift_Q : c[0x0][0x1d8] + param_magic_PQ : c[0x0][0x1dc] + param_shift_PQ : c[0x0][0x1e0] + param_R : c[0x0][0x1e4] + param_T : c[0x0][0x1e8] + param_magic_str_w : c[0x0][0x1ec] + param_shift_str_w : c[0x0][0x1f0] + param_magic_str_h : c[0x0][0x1f4] + param_shift_str_h : c[0x0][0x1f8] + param_magic_str_d : c[0x0][0x1fc] + param_shift_str_d : c[0x0][0x200] + + + + + 64-67 : mpq<0-3> + 64-67 : m, p, q, tidY + 68-72 : blkF, blkI, blkMPQ, tid1, tidX + 73-107 ~ str_d, str_h, str_w, pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, one, rst_prime, x_prime, y_prime, z_prime, ballot, warp_slices, partial, endCRST + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 : j0Fy<0-3> + 68-71 : j0Ix<0-3> + 72-75 : j0Fy<4-7> + 76-79 : j0Ix<4-7> + 80-83 : j1Fy<0-3> + 84-87 : j1Ix<0-3> + 88-91 : j1Fy<4-7> + 92-95 : j1Ix<4-7> + + 96-97 : trackI<0-1> + 98-99 : trackF<0-1> + + 100-103 : loadI<0-3> + 104-107 : loadF<0-3> + 109 : readFs + 108 : readIs + + 110-114 ~ offsetIn, offsetFk, posCRST, lutSize, lutSizeRcp + 115-120 ~ writeS, posCRSTf, channel, lutOffset, offsetI, offsetF + 116-120 ~ tid128, 
tid, p_and + 121 : tmp_shl + + 122-123 : sliceI, sliceF + 122-123 : sliceIF<0-1> + 124-125 ~ offsetIc, offsetFc + 124-125 : tmp_param<0-1> + 124-127 ~ addressF0, addressF1, addressI0, addressI1 + + 72-79 : cs<0-7> + 80-81 : Out<0-1> + 82-125 ~ writeCs, readCs, alpha, tidOX, tidOX2, tidOY, to, k, n, MPQN1, MPQN60, MPQN, MPQN4 + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkF, SR_CTAID.Y; +-:-:-:-:00 S2R blkI, SR_CTAID.Z; +-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; # m,p,q stored in x index + +-:-:-:-:00 ISETP.GE.AND P0, PT, tid, 32, PT; + +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + + return join '', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15; + + +// tid <= 255 +// tidX = (tid & 31) << 2 +// tidX = 0 : 4 : 128 +// tidY = tid >> 5 +// tidY = 0 : 1 : 7 +-:-:-:-:00 LOP.AND tidX, tid, 31; +-:-:-:-:00 SHL tidX, tidX, 2; +-:-:-:-:00 SHR.U32 tidY, tid, 5; + +// trackF += blkF*128 + tidX +-:-:-:-:00 ISCADD offsetFk, blkF, tidX, 7; + +// trackI += blkI*128 + tidX +-:-:-:-:00 ISCADD offsetIn, blkI, tidX, 7; + +// writeS = (128*tidY + tidX/2) * 4 +-:-:-:-:00 SHR tidX, tidX, 1; +-:-:-:-:00 ISCADD writeS, tidY, tidX, 7; +-:-:-:-:00 SHL writeS, writeS, 2; + +// readFs = (((tid & 0x70) >> 3) | (tid & 1)) << 3; +// [6][5][4][0] * 8; +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 LOP.AND readFs, tid, 0x70; +-:-:-:-:00 SHR.U32 readFs, readFs, 3; +-:-:-:-:00 LOP.OR readFs, readFs, tid1; +-:-:-:-:00 SHL readFs, readFs, 3; + +// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7) +// [7][3][2][1] * 8; +-:-:-:-:00 LOP.AND tid128, tid, 128; +-:-:-:-:00 SHR.U32 tid128, tid128, 3; +-:-:-:-:00 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 LOP.OR readIs, readIs, tid128; +-:-:-:-:00 ISCADD readIs, readIs, 4x, 3; + +-:-:-:-:00 @P0 BRA.U END_SETUP; + +-:-:-:-:00 MOV str_d, param_str_d; +-:-:-:-:00 MOV str_h, param_str_h; +-:-:-:-:00 MOV str_w, param_str_w; +-:-:-:-:00 MOV rst, tid; +-:-:-:-:00 MOV lutStore2, RZ; +-:-:-:-:00 MOV lutSize, RZ; 
+-:-:-:-:00 MOV32I warp_count, 32; + +// m = blkMPQ / PQ +// pq = blkMPQ % PQ +-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ; +-:-:-:-:00 SHR.U32 m, m, param_shift_PQ; +-:-:-:-:00 IMAD pq, m, param_PQ, RZ; +-:-:-:-:00 IADD pq, -pq, blkMPQ; +// p = pq / Q +// q = pq % Q +-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ; +-:-:-:-:00 SHR.U32 p, p, param_shift_Q; +-:-:-:-:00 IMAD q, p, param_Q, RZ; +-:-:-:-:00 IADD q, -q, pq; + +-:-:-:-:00 MOV32I dep_thd_mask, -1; + +-:-:-:-:00 LOP.AND p_and, p, 1; +-:-:-:-:00 ISETP.NE.AND P1, PT, p_and, RZ, PT; +-:-:-:-:00 @P1 IADD q, -q, param_Q; +-:-:-:-:00 @P1 IADD q, q, dep_thd_mask; + +-:-:-:-:00 STS.128 [RZ + addr_m], m; + +// qs = q - S + pad_w + 1 +-:-:-:-:00 MOV32I one, 1; +-:-:-:-:00 IADD qs, q, -param_S; +-:-:-:-:00 IADD qs, qs, param_pad_w; +-:-:-:-:00 IADD qs, qs, one; + +// pr = p - R + pad_h + 1 +-:-:-:-:00 IADD pr, p, -param_R; +-:-:-:-:00 IADD pr, pr, param_pad_h; +-:-:-:-:00 IADD pr, pr, one; + +// mt = m - T + pad_d + 1 +-:-:-:-:00 IADD mt, m, -param_T; +-:-:-:-:00 IADD mt, mt, param_pad_d; +-:-:-:-:00 IADD mt, mt, one; + +-:-:-:-:00 IADD mask_shr, -tid, 32; +-:-:-:-:00 SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr; + +LUT_LOOP: + +// warp synchronous loop while warp_count < RST +-:-:-:-:00 ISETP.LT.AND P0, PT, warp_count, param_RST, PT; +-:-:-:-:00 IADD warp_count, warp_count, 32; +// t = rst / RS +// rs = rst % RS +-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ; +-:-:-:-:00 SHR.U32 t, t, param_shift_RS; +-:-:-:-:00 IMAD rs, t, param_RS, RZ; +-:-:-:-:00 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ; +-:-:-:-:00 SHR.U32 r, r, param_shift_S; +-:-:-:-:00 IMAD s, r, param_S, RZ; +-:-:-:-:00 IADD s, -s, rs; +// x = qs + s +// y = pr + r +// z = mt + t +-:-:-:-:00 IADD x, qs, s; +-:-:-:-:00 IADD y, pr, r; +-:-:-:-:00 IADD z, mt, t; +-:-:-:-:00 ISETP.GE.AND P4, PT, x, RZ, PT; +-:-:-:-:00 ISETP.GE.AND P5, PT, y, RZ, PT; +-:-:-:-:00 ISETP.GE.AND P6, PT, z, RZ, 
PT; +// rst_prime = t*RS + r*S + s +// s = S - s - 1 +-:-:-:-:00 IADD s, -s, param_S; +-:-:-:-:00 IADD s, s, -one; +// r = R - r - 1 +-:-:-:-:00 IADD r, -r, param_R; +-:-:-:-:00 IADD r, r, -one; +// t = T - t - 1 +-:-:-:-:00 IADD t, -t, param_T; +-:-:-:-:00 IADD t, t, -one; + +-:-:-:-:00 IMAD rst_prime, r, param_S, s; +-:-:-:-:00 IMAD rst_prime, t, param_RS, rst_prime; + +// x_prime = x / str_w +// x = x % str_w +-:-:-:-:00 IMAD x_prime, x, param_magic_str_w, RZ; +-:-:-:-:00 SHR.U32 x_prime, x_prime, param_shift_str_w; +-:-:-:-:00 IMAD tmp_param0, str_w, x_prime, RZ; +-:-:-:-:00 IADD x, -tmp_param0, x; +// y_prime = y / str_h +// y = y % str_h +-:-:-:-:00 IMAD y_prime, y, param_magic_str_h, RZ; +-:-:-:-:00 SHR.U32 y_prime, y_prime, param_shift_str_h; +-:-:-:-:00 IMAD tmp_param0, str_h, y_prime, RZ; +-:-:-:-:00 IADD y, -tmp_param0, y; +// z_prime = z / str_d +// z = z % str_d +-:-:-:-:00 IMAD z_prime, z, param_magic_str_d, RZ; +-:-:-:-:00 SHR.U32 z_prime, z_prime, param_shift_str_d; +-:-:-:-:00 IMAD tmp_param0, str_d, z_prime, RZ; +-:-:-:-:00 IADD z, -tmp_param0, z; + +// calculate x_prime only when x % str_w == 0 +// it may be greater than Q due to its location +-:-:-:-:00 ISETP.EQ.AND P4, PT, x, RZ, P4; +-:-:-:-:00 ISETP.EQ.AND P5, PT, y, RZ, P5; +-:-:-:-:00 ISETP.EQ.AND P6, PT, z, RZ, P6; +-:-:-:-:00 ISETP.LT.AND P4, PT, x_prime, param_W, P4; +-:-:-:-:00 ISETP.LT.AND P5, PT, y_prime, param_H, P5; +-:-:-:-:00 ISETP.LT.AND P6, PT, z_prime, param_D, P6; +-:-:-:-:00 PSETP.AND.AND P1, PT, P4, P5, P6; + +// sliceI = z_prime*HWN + y_prime*WN + x_prime*N +-:-:-:-:00 IMAD sliceI, x_prime, param_N, RZ; +-:-:-:-:00 IMAD.U32.U32 sliceI, y_prime, param_WN, sliceI; +-:-:-:-:00 IMAD.U32.U32 sliceI, z_prime, param_HWN, sliceI; +// sliceF = rst_prime * K +-:-:-:-:00 IMAD sliceF, rst_prime, param_K, RZ; + +// Get a mask of all valid slices in the warp +-:-:-:-:00 VOTE.ANY ballot, PT, P1; +// Count the total valid slices +-:-:-:-:00 POPC warp_slices, ballot, ballot; +// Prepare 
lutStore for this and next loop +-:-:-:-:00 @P1 MOV lutStore, lutStore2; +-:-:-:-:00 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +-:-:-:-:00 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot; +-:-:-:-:00 @P1 POPC dep_thd_cnt, dep_thd_bits, dep_thd_bits; +// use the rst increment to space the barrier sync +-:-:-:-:00 IADD rst, rst, 32; +// Update the lutStore address from this count +-:-:-:-:00 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +-:-:-:-:00 @P1 STS.64 [lutStore + addr_lut], sliceIF; +// Keep track of the total size of the lut +-:-:-:-:00 IADD lutSize, lutSize, warp_slices; + +-:-:-:-:00 @P0 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +-:-:-:-:00 STS [RZ + addr_szLut], lutSize; + +END_SETUP: + +-:-:-:-:00 BAR.SYNC 0; + +// Grab the calculated lut size and get its reciprocal +// Get the total reduction depth +-:-:-:-:00 LDS lutSize, [RZ + addr_szLut]; +-:-:-:-:00 IMAD endCRST, lutSize, param_C, RZ; +-:-:-:-:00 I2F.F32.S32 lutSizeRcp, lutSize; +-:-:-:-:00 MUFU.RCP lutSizeRcp, lutSizeRcp; + +// posCRST = endCRST - tidY - 1 +-:-:-:-:00 IADD posCRST, endCRST, -1; +-:-:-:-:00 IADD posCRST, posCRST, -tidY; +// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch. +// If it is a multiple of 8 then make a full 8 line fetch. 
+-:-:-:-:00 LOP.AND partial, endCRST, 7; +-:-:-:-:00 ISETP.EQ.AND P1, PT, RZ, partial, PT; +-:-:-:-:00 @P1 MOV32I partial, 8; +// channel = posCRST / lutSize +// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it +-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST; +-:-:-:-:00 FMUL channel, posCRSTf, lutSizeRcp; +-:-:-:-:00 FFMA channel, channel, 5.9604644775390625e-08, channel; +-:-:-:-:00 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST % lutSize) * 8 +-:-:-:-:00 IMAD tmp_param0, channel, lutSize, RZ; +-:-:-:-:00 IADD lutOffset, -tmp_param0, posCRST; + +-:-:-:-:00 SHL lutOffset, lutOffset, 3; +// P1 = tidY < partial +-:-:-:-:00 ISETP.LT.AND P1, PT, tidY, partial, PT; +// offsetIC = channel * DHWN +// offsetFC = channel * KRST +-:-:-:-:00 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ; +-:-:-:-:00 IMAD offsetFc, channel, param_KRST, RZ; +// posCRST -= partial +-:-:-:-:00 IADD posCRST, posCRST, -partial; +-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut]; + +// trackF = offsetFK + offsetFC + sliceF + param_F +-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc; +-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF; +// trackI = offsetIN + offsetIC + sliceI + param_I +-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc; +-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI; + +//-:-:-:-:00 @P1 LEA trackF0.CC, offsetF, param_F[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2; +-:-:-:-:00 @P1 MOV tmp_param0, param_F[0]; +-:-:-:-:00 @P1 MOV tmp_param1, param_F[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2; +-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 @P1 IADD.X trackF1, RZ, tmp_param1; +//-:-:-:-:00 @P1 LEA trackI0.CC, offsetI, param_I[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2; +-:-:-:-:00 @P1 MOV tmp_param0, param_I[0]; +-:-:-:-:00 @P1 MOV tmp_param1, param_I[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2; +-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 @P1 IADD.X 
trackI1, RZ, tmp_param1; + +-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF]; +-:-:-:-:00 @!P1 LDS.128 loadF0, [RZ + addr_zero]; + +-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI]; +-:-:-:-:00 @!P1 LDS.128 loadI0, [RZ + addr_zero]; + +-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT; + +-:-:-:-:00 STS.64 [writeS], loadF0; +-:-:-:-:00 STS.64 [writeS + 4x<64>], loadF2; +-:-:-:-:00 STS.64 [writeS + 4x], loadI0; +-:-:-:-:00 STS.64 [writeS + 4x], loadI2; + +-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST; + +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 LOP.XOR writeS, writeS, 4x; + +-:-:-:-:00 LDS.64 j0Ix0, [readIs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS.64 j0Ix2, [readIs + 4x<0*128 + 64>]; +-:-:-:-:00 LDS.64 j0Fy0, [readFs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS.64 j0Fy2, [readFs + 4x<0*128 + 64>]; + +-:-:-:-:00 LDS.64 j0Ix4, [readIs + 4x<0*128 + 16>]; +-:-:-:-:00 LDS.64 j0Ix6, [readIs + 4x<0*128 + 80>]; +-:-:-:-:00 LDS.64 j0Fy4, [readFs + 4x<0*128 + 32>]; +-:-:-:-:00 LDS.64 j0Fy6, [readFs + 4x<0*128 + 96>]; + +// channel = posCRST / lutSize +-:-:-:-:00 @P1 FMUL channel, posCRSTf, lutSizeRcp; +-:-:-:-:00 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel; +-:-:-:-:00 @P1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST % lutSize) * 8 +-:-:-:-:00 @P1 IMAD tmp_param0, channel, lutSize, RZ; +-:-:-:-:00 @P1 IADD lutOffset, -tmp_param0, posCRST; +-:-:-:-:00 @P1 SHL lutOffset, lutOffset, 3; +// offsetIC = channel * DHWN +// offsetFC = channel * KRST +-:-:-:-:00 @P1 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ; +-:-:-:-:00 @P1 IMAD offsetFc, channel, param_KRST, RZ; + +-:-:-:-:00 IADD posCRST, posCRST, -8; +-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut]; + +// trackF = offsetFK + offsetFC + sliceF + param_F +-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc; +-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF; +// trackI = offsetIN + offsetIC + sliceI + param_I +-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc; +-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI; +//-:-:-:-:00 @P1 LEA trackF0.CC, offsetF, 
param_F[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2; +-:-:-:-:00 MOV addressF0, param_F[0]; +-:-:-:-:00 MOV addressF1, param_F[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2; +-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, addressF0; +-:-:-:-:00 @P1 IADD.X trackF1, RZ, addressF1; +//-:-:-:-:00 @P1 LEA trackI0.CC, offsetI, param_I[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2; +-:-:-:-:00 MOV addressI0, param_I[0]; +-:-:-:-:00 MOV addressI1, param_I[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2; +-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, addressI0; +-:-:-:-:00 @P1 IADD.X trackI1, RZ, addressI1; +-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF]; +-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI]; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + +LOOP: + + + my %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c53 => "-:-:-:-:00 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + j0c61 => "-:-:-:-:00 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + j0c63 => "-:-:-:-:00 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + + j1c47 => "-:-:-:-:00 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j1c63 => "-:-:-:-:00 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j2c47 => "-:-:-:-:00 \@P1 IMAD lutOffset, -channel, lutSize, posCRST;\n", + j2c53 => "-:-:-:-:00 \@P1 IMAD offsetF, channel, param_KRST, offsetFk;\n", + j2c61 => "-:-:-:-:00 \@P1 IMAD offsetI, channel, param_DHWN, offsetIn;\n", + j2c62 => "-:-:-:-:00 \@P1 SHL lutOffset, lutOffset, 3;\n", + j2c63 => "-:-:-:-:00 IADD posCRST, posCRST, -8;\n", + + j3c47 => "-:-:-:-:00 \@P1 LDS.64 sliceIF, [lutOffset + addr_lut];\n", + j3c53 => "-:-:-:-:00 TEXDEPBAR 0x0;\n", + j3c61 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x], loadI0;\n", + j3c62 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x], loadI2;\n", + j3c63 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<0>], loadF0;\n", + + j4c47 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<64>], loadF2;\n", + j4c53 => "-:-:-:-:00 \@P1 IADD offsetF, offsetF, sliceF;\n", + 
j4c61 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetF, 0x2;\n", + j4c62 => "-:-:-:-:00 \@P1 IADD offsetI, offsetI, sliceI;\n", + j4c63 => "-:-:-:-:00 \@P1 IADD trackF0.CC, tmp_shl, addressF0;\n", + + j5c47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetI, 0x2;\n", + j5c53 => "-:-:-:-:00 \@P1 IADD trackI0.CC, tmp_shl, addressI0;\n", + j5c61 => "-:-:-:-:00 \@P1 IADD.X trackF1, RZ, addressF1;\n", + j5c62 => "-:-:-:-:00 \@P1 IADD.X trackI1, RZ, addressI1;\n", + + j6c47 => "-:G:D:-:00 \@P1 LDG.E.128 loadF0, [trackF];\n", + j6c53 => "-:G:D:-:00 \@P1 LDG.E.128 loadI0, [trackI];\n", + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readIs, readIs, 4x;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readFs, readFs, 4x;\n", + j6c63 => "-:-:-:-:00 \@P0 BAR.SYNC 0;\n", + + j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x;\n", + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n", + ); + + my @cOrder; + + push @cOrder, [0,0]; + push @cOrder, [0,1]; + push @cOrder, [1,1]; + push @cOrder, [2,0]; + push @cOrder, [1,0]; + push @cOrder, [2,1]; + push @cOrder, [2,3]; + push @cOrder, [2,2]; + push @cOrder, [1,2]; + push @cOrder, [0,3]; + push @cOrder, [1,3]; + push @cOrder, [0,2]; + push @cOrder, [0,4]; + push @cOrder, [0,5]; + push @cOrder, [1,5]; + push @cOrder, [2,4]; + push @cOrder, [1,4]; + push @cOrder, [2,5]; + push @cOrder, [2,7]; + push @cOrder, [2,6]; + push @cOrder, [1,6]; + push @cOrder, [0,7]; + push @cOrder, [1,7]; + push @cOrder, [0,6]; + push @cOrder, [3,6]; + push @cOrder, [3,7]; + push @cOrder, [4,7]; + push @cOrder, [5,6]; + push @cOrder, [4,6]; + push @cOrder, [5,7]; + push @cOrder, [5,5]; + push @cOrder, [5,4]; + push @cOrder, [4,4]; + push @cOrder, [3,5]; + push @cOrder, [4,5]; + push @cOrder, [3,4]; + push @cOrder, [3,2]; + push @cOrder, [3,3]; + push @cOrder, [4,3]; + push @cOrder, [5,2]; + push @cOrder, [4,2]; + push @cOrder, [5,3]; + push @cOrder, [5,1]; + push @cOrder, [5,0]; + push @cOrder, [4,0]; + push @cOrder, [3,1]; + push @cOrder, [4,1]; + push @cOrder, [3,0]; + push @cOrder, [6,0]; + push 
@cOrder, [7,0]; + push @cOrder, [7,1]; + push @cOrder, [6,2]; + push @cOrder, [6,1]; + push @cOrder, [7,2]; + push @cOrder, [7,5]; + push @cOrder, [6,5]; + push @cOrder, [6,4]; + push @cOrder, [7,3]; + push @cOrder, [7,4]; + push @cOrder, [6,3]; + push @cOrder, [6,6]; + push @cOrder, [6,7]; + push @cOrder, [7,7]; + push @cOrder, [7,6]; + + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c5"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx0, [readIs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx2, [readIs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx4, [readIs + 4x<%d*128 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx6, [readIs + 4x<%d*128 + 80>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy0, [readFs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c29"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy2, [readFs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy4, [readFs + 4x<%d*128 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy6, [readFs + 4x<%d*128 + 96>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $ctrl = "-:-:-:-:00"; + + if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) { + $ins = "-:G:D:-:00 NOP;\n"; + } + + if ($c > 60 && !$ins){ + $ins = "-:-:D:-:07 NOP;\n"; + } + + # 04 and 05 are dual issued + if($ins) { + $ctrl = "-:-:D:-:04"; + } else { + if(($c - 1) % 6 == 0 || ($c - 3) % 6 == 0){ + $ctrl = "-:-:D:-:04"; + } + else{ + $ctrl = "-:-:D:-:05"; + } + } + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +-:-:-:-:00 LDS.128 mpq, [RZ + addr_m]; +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkI, SR_CTAID.Z; +-:-:-:-:00 S2R blkF, SR_CTAID.Y; + +// tidOX = (tid & 7) << 2 + (tid & 128) >> 1 +// tidOY = (tid & 127) >> 3 +-:-:-:-:00 LOP.AND tidOX, tid, 7; +-:-:-:-:00 SHL tidOX, tidOX, 2; +-:-:-:-:00 LOP.AND tidOX2, tid, 128; +-:-:-:-:00 SHR.U32 tidOX2, tidOX2, 1; +-:-:-:-:00 LOP.OR tidOX, tidOX, tidOX2; +-:-:-:-:00 LOP.AND tidOY, tid, 127; +-:-:-:-:00 SHR.U32 tidOY, tidOY, 3; + +-:-:-:-:00 SHL readFs, readFs, 1; +-:-:-:-:00 SHL readIs, readIs, 1; +-:-:-:-:00 LOP.AND readIs, readIs, 0x1ff; +-:-:-:-:00 LOP.AND readFs, readFs, 0x0ff; + +// Div by 4 here collapses k stride +// writeCs = (readKs / 4) * 128 + readNs; +-:-:-:-:00 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +-:-:-:-:00 ISCADD readCs, tidOY, tidOX, 7; +-:-:-:-:00 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +-:-:-:-:00 ISCADD n, blkI, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*128 + tidOY * 4 +-:-:-:-:00 SHL tidOY, tidOY, 2; +-:-:-:-:00 ISCADD k, blkF, tidOY, 7; + +// o = k*MPQN + m*PQN + p*QN + q*N + n +-:-:-:-:00 IMAD to, q, param_N, n; +-:-:-:-:00 IMAD.U32.U32 to, p, param_QN, to; +-:-:-:-:00 IMAD.U32.U32 to, m, param_PQN, to; +-:-:-:-:00 IMAD.U32.U32 to, k, param_MPQN, to; +//-:-:-:-:00 LEA Out0.CC, to, param_O[0], 2; +//-:-:-:-:00 LEA.HI.X Out1, to, 
param_O[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_O[0]; +-:-:-:-:00 MOV tmp_param1, param_O[1]; +-:-:-:-:00 SHL tmp_shl, to, 0x2; +-:-:-:-:00 IADD Out0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X Out1, RZ, tmp_param1; + +-:-:-:-:00 MOV MPQN, param_MPQN; +-:-:-:-:00 SHL MPQN1, MPQN, 2; +-:-:-:-:00 SHL MPQN4, MPQN, 4; +-:-:-:-:00 ISCADD MPQN60, MPQN, -MPQN4, 8; + +-:-:-:-:00 ISETP.LT.AND P0, PT, n, param_N, PT; // n + 0 < N +-:-:-:-:00 IADD n, n, 32; +-:-:-:-:00 ISETP.LT.AND P1, PT, n, param_N, PT; // n + 32 < N + +-:-:-:-:00 MOV alpha, param_alpha; + +-:-:-:-:00 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + if ($y == 4) + { + $out .= sprintf( + "-:-:-:-:00 IADD Out0.CC, Out0, MPQN60;\n" . + "-:-:-:-:00 IADD k, k, 60;\n" . + "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n" . + "-:-:-:-:00 IADD.X Out1, Out1, RZ;\n\n", + ($y) x 8); + } + else + { + $out .= sprintf( + "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" . 
+ "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n\n", + ($y) x 8); + } + + $out .= "-:-:-:-:00 CAL STORE_C;\n\n"; + } + return $out; + + + +-:-:-:-:00 EXIT; + +STORE_C: + +-:-:-:-:00 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n + 0 < N +-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n + 32 < N +-:-:-:-:00 IADD k, k, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +-:-:-:-:00 STS.128 [writeCs + 4x<00>], cs0; +-:-:-:-:00 STS.128 [writeCs + 4x<32>], cs4; +-:-:-:-:00 LDS.128 cs0, [readCs + 4x<00>]; +-:-:-:-:00 LDS.128 cs4, [readCs + 4x<32>]; + +// Store results back to global +-:-:-:-:00 @P2 ST.E.128 [Out + 4x<00>], cs0; +-:-:-:-:00 @P3 ST.E.128 [Out + 4x<32>], cs4; + +-:-:-:-:00 IADD Out0.CC, Out0, MPQN1; +-:-:-:-:00 IADD.X Out1, Out1, RZ; + +-:-:-:-:00 RET; + diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.cu b/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.cu new file mode 100644 index 0000000..fc3ff39 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.cu @@ -0,0 +1,52 @@ +extern "C" +__global__ void sconv_bprop_C1_N64 ( + float* param_test, + float* param_I, + const float* param_F, + const float* param_E, + float param_alpha, + int param_N, + int param_K, + int param_D, + int param_H, + int param_W, + int param_WN, + int param_HWN, + int param_DHWN, + int param_C, + int param_CRST, + int param_RST, + int param_magic_RST, + int param_shift_RST, + int param_RS, + int param_magic_RS, + int param_shift_RS, + int param_S, + int param_magic_S, + int param_shift_S, + int param_pad_d, + int param_pad_h, + int param_pad_w, + int param_str_d, + int param_str_h, + int param_str_w, + int param_Q, + int param_PQ, + int param_QN, + int param_PQN, + int param_MPQN, + int param_magic_Q, + int param_shift_Q, + int param_magic_PQ, + int param_shift_PQ, + int param_CRST8, + int param_MPQN8) { + __shared__ float shared[64 * 8 * 4 * 2]; + + int tid = threadIdx.x; + + shared[tid] = 1; + + *param_I = shared[31 - tid]; + *param_test = 
shared[31 - tid]; + } diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.sass b/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.sass new file mode 100644 index 0000000..ab26e12 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.sass @@ -0,0 +1,805 @@ +# Kernel: sconv_bprop_C1_N64 + +// debug: +// mode1 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +//-:-:-:-:00 SHL tmp_shl, tid, 0x2; +//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0; +//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +//-:-:-:-:00 I2F.F32.U32 rst, rst; +//-:-:-:-:00 ST.E [tmp_param00], rst; +//-:-:-:-:00 EXIT; + +// mode2 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +// +//-:-:-:-:00 MOV32I k, 0x40000000; +//-:-:-:-:00 ST.E [tmp_param0], k; +//-:-:-:-:00 EXIT; + +// modify steps: +// XMAD->IMAD +// shared memory addresses->RZ +// LDG->LD +// LEA->MOV, IADD, SHL +// XMAD.LO2C->IMAD.U32.U32 +// XMAD.PSL->IMAD.U32.U32 +// VMAD->IMAD, IADD +// MOV->MOV32I +// IADD3->IADD, IADD +// POPC +// LOP3 +// ST.CG->ST +// control code +// comments +// LDS.U->LDS +// register<0-7>->register<0-3>, register<4-7> +// avoid register conflicts + +// optimization steps: +// alexnet2 +// initial->227 +// bank conflict->226 +// alignment+dual issue+reuse->245 +// half ldg.128->1700 +// all ldg.128->1777 +// control codes->1900 +// scheduling->1937 +// reduce unnecessary instructions->2100 + + + addr_zero : 4x<64*8*4 + 0> + addr_lut : 4x<64*8*4 + 8> + + param_test[0] : c[0x0][0x140] + param_test[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_F[0] : c[0x0][0x158] + param_F[1] : c[0x0][0x15c] + param_alpha : c[0x0][0x160] + param_N : c[0x0][0x164] + param_K : c[0x0][0x168] + param_D : c[0x0][0x16c] + param_H : c[0x0][0x170] + param_W : c[0x0][0x174] + param_WN : c[0x0][0x178] + param_HWN : c[0x0][0x17c] + param_DHWN : 
c[0x0][0x180] + param_C : c[0x0][0x184] + param_CRST : c[0x0][0x188] + param_RST : c[0x0][0x18c] + param_magic_RST : c[0x0][0x190] + param_shift_RST : c[0x0][0x194] + param_RS : c[0x0][0x198] + param_magic_RS : c[0x0][0x19c] + param_shift_RS : c[0x0][0x1a0] + param_S : c[0x0][0x1a4] + param_magic_S : c[0x0][0x1a8] + param_shift_S : c[0x0][0x1ac] + param_pad_d : c[0x0][0x1b0] + param_pad_h : c[0x0][0x1b4] + param_pad_w : c[0x0][0x1b8] + param_str_d : c[0x0][0x1bc] + param_str_h : c[0x0][0x1c0] + param_str_w : c[0x0][0x1c4] + param_Q : c[0x0][0x1c8] + param_PQ : c[0x0][0x1cc] + param_QN : c[0x0][0x1d0] + param_PQN : c[0x0][0x1d4] + param_MPQN : c[0x0][0x1d8] + param_magic_Q : c[0x0][0x1dc] + param_shift_Q : c[0x0][0x1e0] + param_magic_PQ : c[0x0][0x1e4] + param_shift_PQ : c[0x0][0x1e8] + param_CRST8 : c[0x0][0x1ec] + param_MPQN8 : c[0x0][0x1f0] + + + + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 ~ blkE, blkF, blkMPQ + + 68-95 ~ k<0|4>, tidX, tid1, pq, m, p, q, crst, crst1, crst2, crst3, n, n32, tf<0|4>, te, te<0|4> + + 64-67 : j0Fy<0-3> + 68-71 : j0Ex<0-3> + 72-75 : j0Fy<4-7> + 76-79 : j0Ex<4-7> + 80-83 : j1Fy<0-3> + 84-87 : j1Ex<0-3> + 88-91 : j1Fy<4-7> + 92-95 : j1Ex<4-7> + + 96-99 : load0F<0-3> + 100-103 : load4F<0-3> + 104-107 : load0E<0-3> + 108-111 : load0E<4-7> + 112-115 : load4E<0-3> + 116-119 : load4E<4-7> + + 120-123 : track0F<0-1>, track4F<0-1> + 124-127 : track0E<0-1>, track4E<0-1> + + 128-131 ~ writeEs, writeFs, swapBuf, K + 132-136 ~ readEs, readFs, mt, pr, qs + 137-142 : tmp_data, tmp_shl, tmp_param0, tmp_param1, p_and, tid + 144-145 : tmp_param0<0-1> + + 68-71 ~ lutStore, sliceI + 72-132 ~ warp_cnt, rst, rs, t, r, s, 
x, y, z, x0, xW, y0, yH, z0, zD + + 72-89 : c<0-7>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1> + 90-132 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31 + + + +-:-:-:-:00 S2R tid, SR_TID.X; // 0 : 1 : 31 +-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; // m, p, q +-:-:-:-:00 S2R blkF, SR_CTAID.Y; // crst +-:-:-:-:00 S2R blkE, SR_CTAID.Z; // N + +// [4][3][2][1][0] +// tidX = (tid & 7) << 2 +// tidX = 0 : 4 : 28 +// k0 = tid >> 3 +// k0 = 0 : 1 : 3 +// k4 = 4 : 1 : 7 +-:-:-:-:00 LOP.AND tidX, tid, 7; +-:-:-:-:00 SHL tidX, tidX, 2; +-:-:-:-:00 SHR.U32 k0, tid, 3; +-:-:-:-:00 IADD k4, k0, 4; + +-:-:-:-:00 MOV K, param_K; + +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + return join '', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15; + + +// m = blkMPQ / PQ +// pq = blkMPQ % PQ +-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ; +-:-:-:-:00 SHR.U32 m, m, param_shift_PQ; +-:-:-:-:00 IMAD pq, m, param_PQ, RZ; +-:-:-:-:00 IADD pq, -pq, blkMPQ; +// p = pq / Q +// q = pq % Q +-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ; +-:-:-:-:00 SHR.U32 p, p, param_shift_Q; +-:-:-:-:00 IMAD q, p, param_Q, RZ; +-:-:-:-:00 IADD q, -q, pq; + +// mt = m * w - pad_d +// pr = p * u - pad_h +// qs = q * v - pad_w +-:-:-:-:00 IMAD mt, m, param_str_d, RZ; +-:-:-:-:00 IMAD pr, p, param_str_h, RZ; +-:-:-:-:00 IMAD qs, q, param_str_w, RZ; +-:-:-:-:00 IADD mt, mt, -param_pad_d; +-:-:-:-:00 IADD pr, pr, -param_pad_h; +-:-:-:-:00 IADD qs, qs, -param_pad_w; + +// crst = blkF * 32 + tidX +// n = blkE * 64 + tidX +-:-:-:-:00 ISCADD crst, blkF, tidX, 5; +-:-:-:-:00 IADD crst1, crst, 1; +-:-:-:-:00 IADD crst2, crst, 2; +-:-:-:-:00 IADD crst3, crst, 3; +-:-:-:-:00 ISCADD n, blkE, tidX, 6; +-:-:-:-:00 IADD n32, n, 32; + +// trackF = k * CRST + crst +// k0 = 0 : 1 : 3 +// k4 = 4 : 1 : 7 +// tf0 = k0 * CRST + crst +// tf4 = k4 * CRST + crst +-:-:-:-:00 IMAD tf0, k0, 
param_CRST, crst; +-:-:-:-:00 IMAD tf4, k4, param_CRST, crst; + +//-:-:-:-:00 LEA track0F0.CC, tf0, param_F[0], 2; +//-:-:-:-:00 LEA.HI.X track0F1, tf0, param_F[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_F[0]; +-:-:-:-:00 MOV tmp_param1, param_F[1]; +-:-:-:-:00 SHL tmp_shl, tf0, 0x2; +-:-:-:-:00 IADD track0F0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X track0F1, RZ, tmp_param1; + +//-:-:-:-:00 LEA track4F0.CC, tf4, param_F[0], 2; +//-:-:-:-:00 LEA.HI.X track4F1, tf4, param_F[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_F[0]; +-:-:-:-:00 MOV tmp_param1, param_F[1]; +-:-:-:-:00 SHL tmp_shl, tf4, 0x2; +-:-:-:-:00 IADD track4F0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X track4F1, RZ, tmp_param1; + +// trackE = k * MPQN + m * PQN + p * QN + q * N + n +-:-:-:-:00 IMAD te, q, param_N, n; +-:-:-:-:00 IMAD.U32.U32 te, p, param_QN, te; +-:-:-:-:00 IMAD.U32.U32 te, m, param_PQN, te; +-:-:-:-:00 IMAD.U32.U32 te0, k0, param_MPQN, te; +-:-:-:-:00 IMAD.U32.U32 te4, k4, param_MPQN, te; +//-:-:-:-:00 LEA track0E0.CC, te0, param_E[0], 2; +//-:-:-:-:00 LEA.HI.X track0E1, te0, param_E[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_E[0]; +-:-:-:-:00 MOV tmp_param1, param_E[1]; +-:-:-:-:00 SHL tmp_shl, te0, 0x2; +-:-:-:-:00 IADD track0E0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X track0E1, RZ, tmp_param1; +//-:-:-:-:00 LEA track4E0.CC, te4, param_E[0], 2; +//-:-:-:-:00 LEA.HI.X track4E1, te4, param_E[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_E[0]; +-:-:-:-:00 MOV tmp_param1, param_E[1]; +-:-:-:-:00 SHL tmp_shl, te4, 0x2; +-:-:-:-:00 IADD track4E0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X track4E1, RZ, tmp_param1; + +// P1 = crst < CRST +// P2 = n < N +// P3 = n + 32 < N +-:-:-:-:00 ISETP.LT.AND P2, PT, n, param_N, PT; +-:-:-:-:00 ISETP.LT.AND P3, PT, n32, param_N, PT; + +// writeFs = (32 * k + tidX) * 4 +// tidX = 0 : 4 : 28 +// k = 0 : 1 : 3 +// ------------- +// ------------- +// ------------- +// ------------- k * 32 +// ------ tidX +-:-:-:-:00 ISCADD writeFs, k0, tidX, 5; 
+-:-:-:-:00 SHL writeFs, writeFs, 2; +// writeEs = (64 * k + tidX) * 4 + 32 * 8 * 4 +// tidX = 0 : 4 : 28 +// k = 0 : 1 : 3 +// ------------- +// ------------- +// ------------- +// ------------- k * 64 +// ------ tidX +-:-:-:-:00 ISCADD writeEs, k0, tidX, 6; +-:-:-:-:00 ISCADD writeEs, writeEs, 4x<32*8>, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +// readFs = [4][0] * 4 +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 LOP.AND readFs, tid, -16; +-:-:-:-:00 SHR.U32 readFs, readFs, 3; +-:-:-:-:00 LOP.OR readFs, readFs, tid1; +-:-:-:-:00 SHL readFs, readFs, 4; + +// readEs = ((tid >> 1) & 7) << 4 +// readEs = [3][2][1] * 4 +-:-:-:-:00 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 ISCADD readEs, readEs, 4x<32*8>, 4; + +-:-:-:-:00 MOV32I swapBuf, 4x<32*8 + 64*8>; + +-:-:-:-:00 IADD K, K, -8; + +// CRST +// load0F0-load0F3 +-:-:-:-:00 ISETP.LT.AND P4, PT, crst, param_CRST, PT; +-:-:-:-:00 ISETP.LT.AND P5, PT, crst1, param_CRST, PT; +-:-:-:-:00 ISETP.LT.AND P6, PT, crst2, param_CRST, PT; +-:-:-:-:00 ISETP.LT.AND P1, PT, crst3, param_CRST, PT; + +-:-:-:-:00 @P4 LD.E.CI load0F0, [track0F + 4x<0>]; +-:-:-:-:00 @P5 LD.E.CI load0F1, [track0F + 4x<1>]; +-:-:-:-:00 @P6 LD.E.CI load0F2, [track0F + 4x<2>]; +-:-:-:-:00 @P1 LD.E.CI load0F3, [track0F + 4x<3>]; +-:-:-:-:00 @!P4 LDS.32 load0F0, [RZ + addr_zero]; +-:-:-:-:00 @!P5 LDS.32 load0F1, [RZ + addr_zero]; +-:-:-:-:00 @!P6 LDS.32 load0F2, [RZ + addr_zero]; +-:-:-:-:00 @!P1 LDS.32 load0F3, [RZ + addr_zero]; + +-:-:-:-:00 @P4 LD.E.CI load4F0, [track4F + 4x<0>]; +-:-:-:-:00 @P5 LD.E.CI load4F1, [track4F + 4x<1>]; +-:-:-:-:00 @P6 LD.E.CI load4F2, [track4F + 4x<2>]; +-:-:-:-:00 @P1 LD.E.CI load4F3, [track4F + 4x<3>]; +-:-:-:-:00 @!P4 LDS.32 load4F0, [RZ + addr_zero]; +-:-:-:-:00 @!P5 LDS.32 load4F1, [RZ + addr_zero]; +-:-:-:-:00 @!P6 LDS.32 load4F2, [RZ + addr_zero]; +-:-:-:-:00 @!P1 LDS.32 load4F3, [RZ + addr_zero]; + +// N +// load0E0-load0E3 +-:-:-:-:00 @P2 LD.E.128 load0E0, [track0E + 4x< 0>]; 
+-:-:-:-:00 @P3 LD.E.128 load0E4, [track0E + 4x<32>]; +-:-:-:-:00 @P2 LD.E.128 load4E0, [track4E + 4x< 0>]; +-:-:-:-:00 @P3 LD.E.128 load4E4, [track4E + 4x<32>]; + +-:-:-:-:00 ISETP.GT.AND P2, PT, K, RZ, P2; +-:-:-:-:00 ISETP.GT.AND P3, PT, K, RZ, P3; + +-:-:-:-:00 STS.128 [writeFs + 4x<0*32>], load0F; +-:-:-:-:00 IADD track0F0.CC, track0F0, param_CRST8; +-:-:-:-:00 IADD.X track0F1, track0F1, RZ; + +-:-:-:-:00 STS.128 [writeFs + 4x<4*32>], load4F; +-:-:-:-:00 IADD track4F0.CC, track4F0, param_CRST8; +-:-:-:-:00 IADD.X track4F1, track4F1, RZ; +// mode1 +// -:-:-:-:00 MOV tmp_param0, param_test[0]; +// -:-:-:-:00 MOV tmp_param1, param_test[1]; +// -:-:-:-:00 SHL tmp_shl, tid, 0x2; +// -:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0; +// -:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +// -:-:-:-:00 MOV tmp_data, param_CRST8; +// -:-:-:-:00 I2F.F32.U32 tmp_data, tmp_data; +// -:-:-:-:00 ST.E [tmp_param00], tmp_data; +// -:-:-:-:00 EXIT; + +-:-:-:-:00 STS.128 [writeEs + 4x<0*64 + 0>], load0E0; +-:-:-:-:00 STS.128 [writeEs + 4x<0*64 + 32>], load0E4; +-:-:-:-:00 IADD track0E0.CC, track0E0, param_MPQN8; +-:-:-:-:00 IADD.X track0E1, track0E1, RZ; + +-:-:-:-:00 STS.128 [writeEs + 4x<4*64 + 0>], load4E0; +-:-:-:-:00 STS.128 [writeEs + 4x<4*64 + 32>], load4E4; +-:-:-:-:00 IADD track4E0.CC, track4E0, param_MPQN8; +-:-:-:-:00 IADD.X track4E1, track4E1, RZ; + +-:-:-:-:00 IADD writeEs, writeEs, swapBuf; +-:-:-:-:00 IADD writeFs, writeFs, swapBuf; +-:-:-:-:00 IADD swapBuf, RZ, -swapBuf; + +-:-:-:-:00 IADD K, K, -8; + +-:-:-:-:00 LDS.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +-:-:-:-:00 LDS.128 j0Fy0, [readFs + 4x<0*32 + 00>]; +-:-:-:-:00 LDS.128 j0Ex4, [readEs + 4x<0*64 + 32>]; +-:-:-:-:00 LDS.128 j0Fy4, [readFs + 4x<0*32 + 16>]; + +-:-:-:-:00 @P4 LD.E.CI load0F0, [track0F + 4x<0>]; +-:-:-:-:00 @P5 LD.E.CI load0F1, [track0F + 4x<1>]; +-:-:-:-:00 @P6 LD.E.CI load0F2, [track0F + 4x<2>]; +-:-:-:-:00 @P1 LD.E.CI load0F3, [track0F + 4x<3>]; +-:-:-:-:00 @!P4 LDS.32 load0F0, [RZ + 
addr_zero]; +-:-:-:-:00 @!P5 LDS.32 load0F1, [RZ + addr_zero]; +-:-:-:-:00 @!P6 LDS.32 load0F2, [RZ + addr_zero]; +-:-:-:-:00 @!P1 LDS.32 load0F3, [RZ + addr_zero]; + +-:-:-:-:00 @P4 LD.E.CI load4F0, [track4F + 4x<0>]; +-:-:-:-:00 @P5 LD.E.CI load4F1, [track4F + 4x<1>]; +-:-:-:-:00 @P6 LD.E.CI load4F2, [track4F + 4x<2>]; +-:-:-:-:00 @P1 LD.E.CI load4F3, [track4F + 4x<3>]; +-:-:-:-:00 @!P4 LDS.32 load4F0, [RZ + addr_zero]; +-:-:-:-:00 @!P5 LDS.32 load4F1, [RZ + addr_zero]; +-:-:-:-:00 @!P6 LDS.32 load4F2, [RZ + addr_zero]; +-:-:-:-:00 @!P1 LDS.32 load4F3, [RZ + addr_zero]; + +-:-:-:-:00 @P2 LD.E.128 load0E0, [track0E + 4x< 0>]; +-:-:-:-:00 @P3 LD.E.128 load0E4, [track0E + 4x<32>]; +-:-:-:-:00 @P2 LD.E.128 load4E0, [track4E + 4x< 0>]; +-:-:-:-:00 @P3 LD.E.128 load4E4, [track4E + 4x<32>]; + +-:-:-:-:00 ISETP.GT.AND P2, PT, K, RZ, P2; +-:-:-:-:00 ISETP.GT.AND P3, PT, K, RZ, P3; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + +NEXT_8K: +-:-:-:-:00 ISETP.GT.AND P0, PT, K, -8, PT; + + my %insert = + ( + j0c47 => "-:-:-:-:00 IADD K, K, -8;\n", + j0c53 => "-:-:-:-:00 \@P0 STS.128 [writeFs + 4x<0*32>], load0F;\n", + j0c61 => "-:-:-:-:00 \@P0 IADD track0F0.CC, track0F0, param_CRST8;\n", + j0c62 => "-:-:-:-:00 \@P0 IADD.X track0F1, track0F1, RZ;\n", + j0c63 => "-:-:-:-:00 \@P4 LD.E.CI load0F0, [track0F + 4x<0>];\n", + + j1c47 => "-:-:-:-:00 \@P5 LD.E.CI load0F1, [track0F + 4x<1>];\n", + j1c53 => "-:-:-:-:00 \@P6 LD.E.CI load0F2, [track0F + 4x<2>];\n", + j1c61 => "-:-:-:-:00 \@P1 LD.E.CI load0F3, [track0F + 4x<3>];\n", + j1c62 => "-:-:-:-:00 \@!P4 LDS.32 load0F0, [RZ + addr_zero];\n", + j1c63 => "-:-:-:-:00 \@!P5 LDS.32 load0F1, [RZ + addr_zero];\n", + + j2c47 => "-:-:-:-:00 \@!P6 LDS.32 load0F2, [RZ + addr_zero];\n", + j2c53 => "-:-:-:-:00 \@!P1 LDS.32 load0F3, [RZ + addr_zero];\n", + j2c61 => "-:-:-:-:00 \@P0 STS.128 [writeFs + 4x<4*32>], load4F;\n", + j2c62 => "-:-:-:-:00 \@P0 IADD track4F0.CC, track4F0, param_CRST8;\n", + j2c63 => "-:-:-:-:00 \@P0 IADD.X track4F1, track4F1, RZ;\n", + + 
j3c47 => "-:-:-:-:00 \@P4 LD.E.CI load4F0, [track4F + 4x<0>];\n", + j3c53 => "-:-:-:-:00 \@P5 LD.E.CI load4F1, [track4F + 4x<1>];\n", + j3c61 => "-:-:-:-:00 \@P6 LD.E.CI load4F2, [track4F + 4x<2>];\n", + j3c62 => "-:-:-:-:00 \@P1 LD.E.CI load4F3, [track4F + 4x<3>];\n", + j3c63 => "-:-:-:-:00 \@!P4 LDS.32 load4F0, [RZ + addr_zero];\n", + + j4c47 => "-:-:-:-:00 \@!P5 LDS.32 load4F1, [RZ + addr_zero];\n", + j4c53 => "-:-:-:-:00 \@!P6 LDS.32 load4F2, [RZ + addr_zero];\n", + j4c61 => "-:-:-:-:00 \@!P1 LDS.32 load4F3, [RZ + addr_zero];\n", + j4c62 => "-:-:-:-:00 \@P0 STS.128 [writeEs + 4x<0*64 + 0>], load0E0;\n", + j4c63 => "-:-:-:-:00 \@P0 STS.128 [writeEs + 4x<0*64 + 32>], load0E4;\n", + + j5c47 => "-:-:-:-:00 \@P0 IADD track0E0.CC, track0E0, param_MPQN8;\n", + j5c53 => "-:-:-:-:00 \@P0 IADD.X track0E1, track0E1, RZ;\n", + j5c61 => "-:-:-:-:00 \@P2 LD.E.128 load0E0, [track0E + 4x< 0>];\n", + j5c62 => "-:-:-:-:00 \@P3 LD.E.128 load0E4, [track0E + 4x<32>];\n", + j5c63 => "-:-:-:-:00 \@P0 STS.128 [writeEs + 4x<4*64 + 0>], load4E0;\n", + + j6c47 => "-:-:-:-:00 \@P0 STS.128 [writeEs + 4x<4*64 + 32>], load4E4;\n", + j6c53 => "-:-:-:-:00 \@P0 IADD track4E0.CC, track4E0, param_MPQN8;\n", + j6c61 => "-:-:-:-:00 \@P0 IADD.X track4E1, track4E1, RZ;\n", + j6c62 => "-:-:-:-:00 \@P2 LD.E.128 load4E0, [track4E + 4x< 0>];\n", + j6c63 => "-:-:-:-:00 \@P3 LD.E.128 load4E4, [track4E + 4x<32>];\n". + "-:-:-:-:00 \@P0 IADD readEs, readEs, -swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD writeEs, writeEs, swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c47 => "-:-:-:-:00 ISETP.GT.AND P4, PT, K, RZ, P4;\n", + j7c53 => "-:-:-:-:00 ISETP.GT.AND P5, PT, K, RZ, P5;\n", + j7c61 => "-:-:-:-:00 ISETP.GT.AND P6, PT, K, RZ, P6;\n", + j7c62 => "-:-:-:-:00 ISETP.GT.AND P1, PT, K, RZ, P1;\n", + j7c63 => "-:-:-:-:00 ISETP.GT.AND P2, PT, K, RZ, PT;\n". 
+ "-:-:-:-:00 ISETP.GT.AND P3, PT, K, RZ, PT;\n". + "-:-:-:-:00 \@P0 BRA.U NEXT_8K;\n", + ); + + my @cOrder; + + push @cOrder, [0,0]; + push @cOrder, [0,1]; + push @cOrder, [1,1]; + push @cOrder, [2,0]; + push @cOrder, [1,0]; + push @cOrder, [2,1]; + push @cOrder, [2,3]; + push @cOrder, [2,2]; + push @cOrder, [1,2]; + push @cOrder, [0,3]; + push @cOrder, [1,3]; + push @cOrder, [0,2]; + push @cOrder, [0,4]; + push @cOrder, [0,5]; + push @cOrder, [1,5]; + push @cOrder, [2,4]; + push @cOrder, [1,4]; + push @cOrder, [2,5]; + push @cOrder, [2,7]; + push @cOrder, [2,6]; + push @cOrder, [1,6]; + push @cOrder, [0,7]; + push @cOrder, [1,7]; + push @cOrder, [0,6]; + push @cOrder, [3,6]; + push @cOrder, [3,7]; + push @cOrder, [4,7]; + push @cOrder, [5,6]; + push @cOrder, [4,6]; + push @cOrder, [5,7]; + push @cOrder, [5,5]; + push @cOrder, [5,4]; + push @cOrder, [4,4]; + push @cOrder, [3,5]; + push @cOrder, [4,5]; + push @cOrder, [3,4]; + push @cOrder, [3,2]; + push @cOrder, [3,3]; + push @cOrder, [4,3]; + push @cOrder, [5,2]; + push @cOrder, [4,2]; + push @cOrder, [5,3]; + push @cOrder, [5,1]; + push @cOrder, [5,0]; + push @cOrder, [4,0]; + push @cOrder, [3,1]; + push @cOrder, [4,1]; + push @cOrder, [3,0]; + push @cOrder, [6,0]; + push @cOrder, [7,0]; + push @cOrder, [7,1]; + push @cOrder, [6,2]; + push @cOrder, [6,1]; + push @cOrder, [7,2]; + push @cOrder, [7,5]; + push @cOrder, [6,5]; + push @cOrder, [6,4]; + push @cOrder, [7,3]; + push @cOrder, [7,4]; + push @cOrder, [6,3]; + push @cOrder, [6,6]; + push @cOrder, [6,7]; + push @cOrder, [7,7]; + push @cOrder, [7,6]; + + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? 
'@P0' : ' '; + + $insert{"j${j}c5"} = sprintf "-:G:D:-:01 %s LDS.64 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dEx2, [readEs + 4x<%d*64 + 2>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dEx6, [readEs + 4x<%d*64 + 34>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c29"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy2, [readFs + 4x<%d*32 + 2>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy6, [readFs + 4x<%d*32 + 18>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $ctrl = "-:-:-:-:00"; + + if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) { + $ins = "-:G:D:-:00 NOP;\n"; + } + + if ($c > 60 && !$ins){ + $ins = "-:-:D:-:07 NOP;\n"; + } + + # 04 and 05 are dual issued + if($ins) { + $ctrl = "-:-:D:-:04"; + } else { + if(($c - 1) % 6 == 0 || ($c - 3) % 6 == 0){ + $ctrl = "-:-:D:-:04"; + } + else{ + $ctrl = "-:-:D:-:05"; + } + } + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +-:-:-:-:00 MOV32I warp_cnt, 32; +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkF, SR_CTAID.Y; +-:-:-:-:00 S2R blkE, SR_CTAID.Z; +-:-:-:-:00 MOV rst, tid; + +LUT_LOOP: + +// warp synchronous loop while warp_cnt < RST (c=0) +-:-:-:-:00 ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT; +-:-:-:-:00 IADD warp_cnt, warp_cnt, 32; +// t = rst / RS +// rs = rst % RS +-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, 
RZ; +-:-:-:-:00 SHR.U32 t, t, param_shift_RS; +-:-:-:-:00 IMAD rs, t, param_RS, RZ; +-:-:-:-:00 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ; +-:-:-:-:00 SHR.U32 r, r, param_shift_S; +-:-:-:-:00 IMAD s, r, param_S, RZ; +-:-:-:-:00 IADD s, -s, rs; +// x = qs + s +// y = pr + r +// z = mt + t +-:-:-:-:00 IADD z, mt, t; +-:-:-:-:00 IADD y, pr, r; +-:-:-:-:00 IADD x, qs, s; +// i = (z*HWN + y*WN + x*N) * 4 +-:-:-:-:00 IMAD.U32.U32 sliceI, z, param_HWN, RZ; +-:-:-:-:00 IMAD.U32.U32 sliceI, y, param_WN, sliceI; +-:-:-:-:00 IMAD sliceI, x, param_N, sliceI; +-:-:-:-:00 SHL sliceI, sliceI, 2; +// Bounds check x, y and z, and make i negative if outside +-:-:-:-:00 ISET.LT.AND x0, x, RZ, PT; +-:-:-:-:00 ISET.GE.AND xW, x, param_W, PT; +-:-:-:-:00 ISET.LT.AND y0, y, RZ, PT; +-:-:-:-:00 ISET.GE.AND yH, y, param_H, PT; +-:-:-:-:00 ISET.LT.AND z0, z, RZ, PT; +-:-:-:-:00 ISET.GE.AND zD, z, param_D, PT; +// if x0 || xW || y0 || yH || z0 || zD then sliceI = -1 +//-:-:-:-:00 LOP3.LUT sliceI, sliceI, x0, xW, 0xfe; +//-:-:-:-:00 LOP3.LUT sliceI, sliceI, y0, yH, 0xfe; +//-:-:-:-:00 LOP3.LUT sliceI, sliceI, z0, zD, 0xfe; +-:-:-:-:00 LOP.OR tmp_data, x0, xW; +-:-:-:-:00 LOP.OR tmp_data, tmp_data, y0; +-:-:-:-:00 LOP.OR tmp_data, tmp_data, yH; +-:-:-:-:00 LOP.OR tmp_data, tmp_data, z0; +-:-:-:-:00 LOP.OR tmp_data, tmp_data, zD; +-:-:-:-:00 LOP.OR sliceI, tmp_data, sliceI; + +-:-:-:-:00 SHL lutStore, rst, 2; +-:-:-:-:00 IADD rst, rst, 32; +// Store i imgOffset into the shared lookup table +-:-:-:-:00 STS [lutStore + addr_lut], sliceI; + +-:-:-:-:00 @P0 BRA.U LUT_LOOP; + +-:-:-:-:00 MOV RST, param_RST; +-:-:-:-:00 MOV DHWN1, param_DHWN; +-:-:-:-:00 SHL DHWN1, DHWN1, 2; + +-:-:-:-:00 LOP.AND readEs, readEs, 0x7f; +-:-:-:-:00 LOP.AND readFs, readFs, 0x3f; + +// writeCs = ((readFs / 4) * 64 + readEs); +-:-:-:-:00 ISCADD writeCs, readFs, readEs, 4; + +// readCs = (tid & 31) << 2; +-:-:-:-:00 LOP.AND tid31, tid, 31; +-:-:-:-:00 SHL readCs, tid31,
2; + +// nn = blkE * 64 + tid31; +-:-:-:-:00 ISCADD nn, blkE, tid31, 6; + +// crst = blkF * 32 +-:-:-:-:00 SHL crst00, blkF, 5; +-:-:-:-:00 IADD crst04, crst00, 4; +-:-:-:-:00 IADD crst08, crst00, 8; +-:-:-:-:00 IADD crst12, crst00, 12; + +// -:-:-:-:00 LEA trackI0.CC, nn, param_I[0], 2; +// -:-:-:-:00 LEA.HI.X trackI1, nn, param_I[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_I[0]; +-:-:-:-:00 MOV tmp_param1, param_I[1]; +-:-:-:-:00 SHL tmp_shl, nn, 0x2; +-:-:-:-:00 IADD trackI0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackI1, RZ, tmp_param1; + +// n < N +-:-:-:-:00 ISETP.LT.AND P5, PT, nn, param_N, PT; +-:-:-:-:00 IADD nn, nn, 32; +-:-:-:-:00 ISETP.LT.AND P6, PT, nn, param_N, PT; + +-:-:-:-:00 MOV alpha, param_alpha; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "-:-:-:-:00 IADD crst00, crst00, 12;\n" . + "-:-:-:-:00 IADD crst04, crst04, 12;\n" . + "-:-:-:-:00 IADD crst08, crst08, 12;\n" . + "-:-:-:-:00 IADD crst12, crst12, 12;\n" if $y == 4; + + $out .= sprintf( + "-:-:-:-:00 FMUL c0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL c1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL c2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL c3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL c4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL c5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL c6, cx6y%d, alpha;\n" . 
+ "-:-:-:-:00 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "-:-:-:-:00 CAL STORE_C;\n\n"; + } + return $out; + + + +-:-:-:-:00 EXIT; + +STORE_C: + +// Warp shuffle to drop the awkward readAs/readBs mapping +-:-:-:-:00 STS.128 [writeCs+4x<00>], c0; +-:-:-:-:00 STS.128 [writeCs+4x<32>], c4; + +-:-:-:-:00 LDS c0, [readCs + 4x<0*64 + 00>]; +-:-:-:-:00 LDS c1, [readCs + 4x<0*64 + 32>]; +-:-:-:-:00 LDS c2, [readCs + 4x<1*64 + 00>]; +-:-:-:-:00 LDS c3, [readCs + 4x<1*64 + 32>]; +-:-:-:-:00 LDS c4, [readCs + 4x<2*64 + 00>]; +-:-:-:-:00 LDS c5, [readCs + 4x<2*64 + 32>]; +-:-:-:-:00 LDS c6, [readCs + 4x<3*64 + 00>]; +-:-:-:-:00 LDS c7, [readCs + 4x<3*64 + 32>]; + +-:-:-:-:00 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; +-:-:-:-:00 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; +-:-:-:-:00 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; + +// c00 = crst00 / RST +// lut00 = crst00 % RST +-:-:-:-:00 IMAD.U32.U32 c00, crst00, param_magic_RST, RZ; +-:-:-:-:00 IMAD.U32.U32 c04, crst04, param_magic_RST, RZ; +-:-:-:-:00 IMAD.U32.U32 c08, crst08, param_magic_RST, RZ; +-:-:-:-:00 IMAD.U32.U32 c12, crst12, param_magic_RST, RZ; + +-:-:-:-:00 SHR.U32 c00, c00, param_shift_RST; +-:-:-:-:00 SHR.U32 c04, c04, param_shift_RST; +-:-:-:-:00 SHR.U32 c08, c08, param_shift_RST; +-:-:-:-:00 SHR.U32 c12, c12, param_shift_RST; + +//-:-:-:-:00 VMAD.U16.U16 lut00, -c00, RST, crst00; +-:-:-:-:00 IMAD lut00, -c00, RST, RZ; +-:-:-:-:00 IADD lut00, lut00, crst00; +//-:-:-:-:00 VMAD.U16.U16 lut04, -c04, RST, crst04; +-:-:-:-:00 IMAD lut04, -c04, RST, RZ; +-:-:-:-:00 IADD lut04, lut04, crst04; +//-:-:-:-:00 VMAD.U16.U16 lut08, -c08, RST, crst08; +-:-:-:-:00 IMAD lut08, -c08, RST, RZ; +-:-:-:-:00 IADD lut08, lut08, crst08; +//-:-:-:-:00 VMAD.U16.U16 lut12, -c12, RST, crst12; +-:-:-:-:00 IMAD lut12, -c12, RST, RZ; +-:-:-:-:00 IADD lut12, lut12, crst12; + +-:-:-:-:00 SHL lut00, lut00, 2; +-:-:-:-:00 SHL lut04, lut04, 2; +-:-:-:-:00 SHL lut08, lut08, 
2; +-:-:-:-:00 SHL lut12, lut12, 2; + +-:-:-:-:00 IMAD.U32.U32 chan00, DHWN1, c00, RZ; +-:-:-:-:00 IMAD.U32.U32 chan04, DHWN1, c04, RZ; +-:-:-:-:00 IMAD.U32.U32 chan08, DHWN1, c08, RZ; +-:-:-:-:00 IMAD.U32.U32 chan12, DHWN1, c12, RZ; + +-:-:-:-:00 IADD crst00, crst00, 1; +-:-:-:-:00 IADD crst04, crst04, 1; +-:-:-:-:00 IADD crst08, crst08, 1; +-:-:-:-:00 IADD crst12, crst12, 1; + +-:-:-:-:00 @P0 LDS img00, [lut00 + addr_lut]; +-:-:-:-:00 @P1 LDS img04, [lut04 + addr_lut]; +-:-:-:-:00 @P2 LDS img08, [lut08 + addr_lut]; +-:-:-:-:00 @P3 LDS img12, [lut12 + addr_lut]; + +-:-:-:-:00 ISETP.GE.AND P0, PT, img00, RZ, P0; +-:-:-:-:00 IADD tmp_data, img00, chan00; +-:-:-:-:00 IADD track00I0.CC, trackI0, tmp_data; +-:-:-:-:00 IADD.X track00I1, trackI1, RZ; + +-:-:-:-:00 ISETP.GE.AND P1, PT, img04, RZ, P1; +-:-:-:-:00 IADD tmp_data, img04, chan04; +-:-:-:-:00 IADD track04I0.CC, trackI0, tmp_data; +-:-:-:-:00 IADD.X track04I1, trackI1, RZ; + +-:-:-:-:00 ISETP.GE.AND P2, PT, img08, RZ, P2; +-:-:-:-:00 IADD tmp_data, img08, chan08; +-:-:-:-:00 IADD track08I0.CC, trackI0, tmp_data; +-:-:-:-:00 IADD.X track08I1, trackI1, RZ; + +-:-:-:-:00 ISETP.GE.AND P3, PT, img12, RZ, P3; +-:-:-:-:00 IADD tmp_data, img12, chan12; +-:-:-:-:00 IADD track12I0.CC, trackI0, tmp_data; +-:-:-:-:00 IADD.X track12I1, trackI1, RZ; + +-:-:-:-:00 @P0 RED.E.ADD.F32.FTZ.RN [track00I], c0; +-:-:-:-:00 PSETP.AND.AND P0, PT, P0, P6, PT; +-:-:-:-:00 @P1 RED.E.ADD.F32.FTZ.RN [track04I], c2; +-:-:-:-:00 PSETP.AND.AND P1, PT, P1, P6, PT; +-:-:-:-:00 @P2 RED.E.ADD.F32.FTZ.RN [track08I], c4; +-:-:-:-:00 PSETP.AND.AND P2, PT, P2, P6, PT; +-:-:-:-:00 @P3 RED.E.ADD.F32.FTZ.RN [track12I], c6; +-:-:-:-:00 PSETP.AND.AND P3, PT, P3, P6, PT; + +-:-:-:-:00 @P0 RED.E.ADD.F32.FTZ.RN [track00I + 4x<32>], c1; +-:-:-:-:00 @P1 RED.E.ADD.F32.FTZ.RN [track04I + 4x<32>], c3; +-:-:-:-:00 @P2 RED.E.ADD.F32.FTZ.RN [track08I + 4x<32>], c5; +-:-:-:-:00 @P3 RED.E.ADD.F32.FTZ.RN [track12I + 4x<32>], c7; + +-:-:-:-:00 RET; + diff --git 
a/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.cu b/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.cu new file mode 100644 index 0000000..92ce953 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.cu @@ -0,0 +1,56 @@ +extern "C" +__global__ void sconv_bprop_C64_N64 ( + float* param_test, + float* param_O, + const float* param_I, + const float* param_F, + float param_alpha, + int param_N, + int param_K, + int param_D, + int param_H, + int param_W, + int param_WN, + int param_HWN, + int param_DHWN, + int param_C, + int param_CRST, + int param_RST, + int param_RS, + int param_magic_RS, + int param_shift_RS, + int param_S, + int param_magic_S, + int param_shift_S, + int param_pad_d, + int param_pad_h, + int param_pad_w, + int param_str_d, + int param_str_h, + int param_str_w, + int param_Q, + int param_PQ, + int param_QN, + int param_PQN, + int param_MPQN, + int param_magic_Q, + int param_shift_Q, + int param_magic_PQ, + int param_shift_PQ, + int param_R, + int param_T, + int param_magic_str_w, + int param_shift_str_w, + int param_magic_str_h, + int param_shift_str_h, + int param_magic_str_d, + int param_shift_str_d) { + __shared__ float share[64 * 8 * 4 + 8]; + + int tid = threadIdx.x; + + share[tid] = 1; + + *param_O = share[63-tid]; + *param_test = share[63-tid]; + } diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.sass b/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.sass new file mode 100644 index 0000000..e6ddb9e --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.sass @@ -0,0 +1,783 @@ +# Kernel: sconv_bprop_C64_N64 +// debug: +// mode1 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +//-:-:-:-:00 SHL tmp_shl, tid, 0x2; +//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0; +//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +//-:-:-:-:00 I2F.F32.U32 rst, rst; +//-:-:-:-:00 ST.E [tmp_param00], rst; +//-:-:-:-:00 EXIT; + +// mode2 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; 
+//-:-:-:-:00 MOV tmp_param1, param_test[1]; +// +//-:-:-:-:00 MOV32I k, 0x40000000; +//-:-:-:-:00 ST.E [tmp_param0], k; +//-:-:-:-:00 EXIT; + +// modify steps: +// XMAD->IMAD +// shared memory addresses->RZ +// LDG->LD +// LEA->MOV, IADD, SHL +// XMAD.LO2C->IMAD.U32.U32 +// XMAD.PSL->IMAD.U32.U32 +// VMAD->IMAD, IADD +// MOV->MOV32I +// IADD3->IADD, IADD +// POPC +// ST.CG->ST +// control code +// comments +// LDS.U->LDS +// register<0-7>->register<0-3>, register<4-7> +// avoid register conflicts +// tid->other register + +// optimization steps: +// alexnet2 +// initial->1200 +// bank conflict->1300 +// alignment+dual issue+reuse->1700 +// all ldg.128->1900 +// control codes->2000 +// reduce unnecessary instructions->2100 +// scheduling->1937 + + + addr_zero : 4x<64*8*4 + 0> + addr_m : 4x<64*8*4 + 4> + addr_p : 4x<64*8*4 + 5> + addr_q : 4x<64*8*4 + 6> + addr_szLut : 4x<64*8*4 + 7> + addr_lut : 4x<64*8*4 + 8> + + param_test[0] : c[0x0][0x140] + param_test[1] : c[0x0][0x144] + param_O[0] : c[0x0][0x148] + param_O[1] : c[0x0][0x14c] + param_I[0] : c[0x0][0x150] + param_I[1] : c[0x0][0x154] + param_F[0] : c[0x0][0x158] + param_F[1] : c[0x0][0x15c] + param_alpha : c[0x0][0x160] + param_N : c[0x0][0x164] + param_K : c[0x0][0x168] + param_D : c[0x0][0x16c] + param_H : c[0x0][0x170] + param_W : c[0x0][0x174] + param_WN : c[0x0][0x178] + param_HWN : c[0x0][0x17c] + param_DHWN : c[0x0][0x180] + param_C : c[0x0][0x184] + param_KRST : c[0x0][0x188] + param_RST : c[0x0][0x18c] + param_RS : c[0x0][0x190] + param_magic_RS : c[0x0][0x194] + param_shift_RS : c[0x0][0x198] + param_S : c[0x0][0x19c] + param_magic_S : c[0x0][0x1a0] + param_shift_S : c[0x0][0x1a4] + param_pad_d : c[0x0][0x1a8] + param_pad_h : c[0x0][0x1ac] + param_pad_w : c[0x0][0x1b0] + param_str_d : c[0x0][0x1b4] + param_str_h : c[0x0][0x1b8] + param_str_w : c[0x0][0x1bc] + param_Q : c[0x0][0x1c0] + param_PQ : c[0x0][0x1c4] + param_QN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_MPQN : c[0x0][0x1d0] + 
param_magic_Q : c[0x0][0x1d4] + param_shift_Q : c[0x0][0x1d8] + param_magic_PQ : c[0x0][0x1dc] + param_shift_PQ : c[0x0][0x1e0] + param_R : c[0x0][0x1e4] + param_T : c[0x0][0x1e8] + param_magic_str_w : c[0x0][0x1ec] + param_shift_str_w : c[0x0][0x1f0] + param_magic_str_h : c[0x0][0x1f4] + param_shift_str_h : c[0x0][0x1f8] + param_magic_str_d : c[0x0][0x1fc] + param_shift_str_d : c[0x0][0x200] + + + + + 64-67 : mpq<0-3> + 64-67 : m, p, q, tidY + 68-70 : blkF, blkI, blkMPQ + 72-95 ~ tid1, tidX + 72-107 ~ str_d, str_h, str_w, pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, one, rst_prime, x_prime, y_prime, z_prime, ballot, warp_slices, partial, endCRST + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 : j0Fy<0-3> + 68-71 : j0Ix<0-3> + 72-75 : j0Fy<4-7> + 76-79 : j0Ix<4-7> + 80-83 : j1Fy<0-3> + 84-87 : j1Ix<0-3> + 88-91 : j1Fy<4-7> + 92-95 : j1Ix<4-7> + + 136-139 : offsetF, offsetIc, offsetFc + 140-141 : sliceI, sliceF + 140-141 : sliceIF<0-1> + 142-145 : addressF<0-1>, addressI<0-1> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-103 : loadI<0-3> + 104-107 : loadF<0-3> + 108-111 : loadI<4-7> + 112-115 : loadF<4-7> + + 116-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI + 126-127 : readFs, readIs + 128-131 : tmp_data, tmp_shl, p_and, tid + 132-133 : tmp_param<0-1> + + 72-79 : cs<0-7> + 80-81 : Out<0-1> + 82-125 ~ writeCs, readCs, alpha, tidOX, tidOY, to, k, n, MPQN1, MPQN28, MPQN, MPQN4 + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkF, SR_CTAID.Y; +-:-:-:-:00 S2R blkI, SR_CTAID.Z; +-:-:-:-:00 S2R blkMPQ, 
SR_CTAID.X; # m,p,q stored in x index + +-:-:-:-:00 ISETP.GE.AND P0, PT, tid, 32, PT; + +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + + return join '', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15; + + +// tidX = (tid & 7) << 2 +// tidY = tid >> 3 +-:-:-:-:00 LOP.AND tidX, tid, 7; +-:-:-:-:00 SHL tidX, tidX, 2; +-:-:-:-:00 SHR.U32 tidY, tid, 3; + +// trackF += blkF*64 + tidX +-:-:-:-:00 ISCADD offsetFk, blkF, tidX, 6; + +// trackI += blkI*64 + tidX +-:-:-:-:00 ISCADD offsetIn, blkI, tidX, 6; + +// writeS = (64*tidY + tidX) * 4 +-:-:-:-:00 ISCADD writeS, tidY, tidX, 6; +-:-:-:-:00 SHL writeS, writeS, 2; + +// readFs = (((tid & 0x30) >> 3) | (tid & 1)) << 4; +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 LOP.AND readFs, tid, 0x30; +-:-:-:-:00 SHR.U32 readFs, readFs, 3; +-:-:-:-:00 LOP.OR readFs, readFs, tid1; +-:-:-:-:00 SHL readFs, readFs, 4; + +// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +-:-:-:-:00 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 ISCADD readIs, readIs, 4x<8*64>, 4; + +-:-:-:-:00 @P0 BRA.U END_SETUP; + +-:-:-:-:00 MOV str_d, param_str_d; +-:-:-:-:00 MOV str_h, param_str_h; +-:-:-:-:00 MOV str_w, param_str_w; +-:-:-:-:00 MOV rst, tid; +-:-:-:-:00 MOV lutStore2, RZ; +-:-:-:-:00 MOV lutSize, RZ; +-:-:-:-:00 MOV32I warp_count, 32; + +// m = blkMPQ / PQ +// pq = blkMPQ % PQ +-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ; +-:-:-:-:00 SHR.U32 m, m, param_shift_PQ; +-:-:-:-:00 IMAD pq, m, param_PQ, RZ; +-:-:-:-:00 IADD pq, -pq, blkMPQ; +// p = pq / Q +// q = pq % Q +-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ; +-:-:-:-:00 SHR.U32 p, p, param_shift_Q; +-:-:-:-:00 IMAD q, p, param_Q, RZ; +-:-:-:-:00 IADD q, -q, pq; + +-:-:-:-:00 MOV32I dep_thd_mask, -1; + +-:-:-:-:00 LOP.AND p_and, p, 1; +-:-:-:-:00 ISETP.NE.AND P1, PT, p_and, RZ, PT; +-:-:-:-:00 @P1 IADD q, -q, param_Q; +-:-:-:-:00 @P1 IADD q, q, dep_thd_mask; + +-:-:-:-:00 STS.128 [RZ + addr_m], m; + +// qs = q - S + pad_w + 1 +-:-:-:-:00 
MOV32I one, 1; +-:-:-:-:00 IADD qs, q, -param_S; +-:-:-:-:00 IADD qs, qs, param_pad_w; +-:-:-:-:00 IADD qs, qs, one; + +// pr = p - R + pad_h + 1 +-:-:-:-:00 IADD pr, p, -param_R; +-:-:-:-:00 IADD pr, pr, param_pad_h; +-:-:-:-:00 IADD pr, pr, one; + +// mt = m - T + pad_d + 1 +-:-:-:-:00 IADD mt, m, -param_T; +-:-:-:-:00 IADD mt, mt, param_pad_d; +-:-:-:-:00 IADD mt, mt, one; + +-:-:-:-:00 IADD mask_shr, -tid, 32; +-:-:-:-:00 SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr; + +LUT_LOOP: + +// warp synchronous loop while warp_count < RST +-:-:-:-:00 ISETP.LT.AND P0, PT, warp_count, param_RST, PT; +-:-:-:-:00 IADD warp_count, warp_count, 32; +// t = rst / RS +// rs = rst % RS +-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ; +-:-:-:-:00 SHR.U32 t, t, param_shift_RS; +-:-:-:-:00 IMAD rs, t, param_RS, RZ; +-:-:-:-:00 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ; +-:-:-:-:00 SHR.U32 r, r, param_shift_S; +-:-:-:-:00 IMAD s, r, param_S, RZ; +-:-:-:-:00 IADD s, -s, rs; +// x = qs + s +// y = pr + r +// z = mt + t +-:-:-:-:00 IADD x, qs, s; +-:-:-:-:00 IADD y, pr, r; +-:-:-:-:00 IADD z, mt, t; +-:-:-:-:00 ISETP.GE.AND P4, PT, x, RZ, PT; +-:-:-:-:00 ISETP.GE.AND P5, PT, y, RZ, PT; +-:-:-:-:00 ISETP.GE.AND P6, PT, z, RZ, PT; +// rst_prime = t*RS + r*S + s +// s = S - s - 1 +-:-:-:-:00 IADD s, -s, param_S; +-:-:-:-:00 IADD s, s, -one; +// r = R - r - 1 +-:-:-:-:00 IADD r, -r, param_R; +-:-:-:-:00 IADD r, r, -one; +// t = T - t - 1 +-:-:-:-:00 IADD t, -t, param_T; +-:-:-:-:00 IADD t, t, -one; + +-:-:-:-:00 IMAD rst_prime, r, param_S, s; +-:-:-:-:00 IMAD rst_prime, t, param_RS, rst_prime; + +// x_prime = x / str_w +// x = x % str_w +-:-:-:-:00 IMAD x_prime, x, param_magic_str_w, RZ; +-:-:-:-:00 SHR.U32 x_prime, x_prime, param_shift_str_w; +-:-:-:-:00 IMAD tmp_param0, str_w, x_prime, RZ; +-:-:-:-:00 IADD x, -tmp_param0, x; +// y_prime = y / str_h +// y = y % str_h +-:-:-:-:00 IMAD y_prime, y, param_magic_str_h, RZ; +-:-:-:-:00 
SHR.U32 y_prime, y_prime, param_shift_str_h; +-:-:-:-:00 IMAD tmp_param0, str_h, y_prime, RZ; +-:-:-:-:00 IADD y, -tmp_param0, y; +// z_prime = z / str_d +// z = z % str_d +-:-:-:-:00 IMAD z_prime, z, param_magic_str_d, RZ; +-:-:-:-:00 SHR.U32 z_prime, z_prime, param_shift_str_d; +-:-:-:-:00 IMAD tmp_param0, str_d, z_prime, RZ; +-:-:-:-:00 IADD z, -tmp_param0, z; + +// calculate x_prime only when x % str_w == 0 +// it may be greater than Q due to its location +-:-:-:-:00 ISETP.EQ.AND P4, PT, x, RZ, P4; +-:-:-:-:00 ISETP.EQ.AND P5, PT, y, RZ, P5; +-:-:-:-:00 ISETP.EQ.AND P6, PT, z, RZ, P6; +-:-:-:-:00 ISETP.LT.AND P4, PT, x_prime, param_W, P4; +-:-:-:-:00 ISETP.LT.AND P5, PT, y_prime, param_H, P5; +-:-:-:-:00 ISETP.LT.AND P6, PT, z_prime, param_D, P6; +-:-:-:-:00 PSETP.AND.AND P1, PT, P4, P5, P6; + +// sliceI = z_prime*HWN + y_prime*WN + x_prime*N +-:-:-:-:00 IMAD sliceI, x_prime, param_N, RZ; +-:-:-:-:00 IMAD.U32.U32 sliceI, y_prime, param_WN, sliceI; +-:-:-:-:00 IMAD.U32.U32 sliceI, z_prime, param_HWN, sliceI; +// sliceF = rst_prime * K +-:-:-:-:00 IMAD sliceF, rst_prime, param_K, RZ; + +// Get a mask of all valid slices in the warp +-:-:-:-:00 VOTE.ANY ballot, PT, P1; +// Count the total valid slices +-:-:-:-:00 POPC warp_slices, ballot, ballot; +// Prepare lutStore for this and next loop +-:-:-:-:00 @P1 MOV lutStore, lutStore2; +-:-:-:-:00 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +-:-:-:-:00 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot; +-:-:-:-:00 @P1 POPC dep_thd_cnt, dep_thd_bits, dep_thd_bits; +// use the rst increment to space the barrier sync +-:-:-:-:00 IADD rst, rst, 32; +// Update the lutStore address from this count +-:-:-:-:00 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +-:-:-:-:00 @P1 STS.64 [lutStore + addr_lut], sliceIF; +// Keep track of the total size of the lut +-:-:-:-:00 IADD lutSize, lutSize, warp_slices; + +-:-:-:-:00 @P0 BRA.U LUT_LOOP; + +// 
Share the lut size with the other warp +-:-:-:-:00 STS [RZ + addr_szLut], lutSize; + +END_SETUP: + +-:-:-:-:00 BAR.SYNC 0; + +// Grab the calculated lut size and get its reciprocal +// Get the total reduction depth +-:-:-:-:00 LDS lutSize, [RZ + addr_szLut]; +-:-:-:-:00 IMAD endCRST, lutSize, param_C, RZ; +-:-:-:-:00 I2F.F32.S32 lutSizeRcp, lutSize; +-:-:-:-:00 MUFU.RCP lutSizeRcp, lutSizeRcp; + +// posCRST = endCRST - tidY - 1 +-:-:-:-:00 IADD posCRST, endCRST, -1; +-:-:-:-:00 IADD posCRST, posCRST, -tidY; +// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch. +// If it is a multiple of 8 then make a full 8 line fetch. +-:-:-:-:00 LOP.AND partial, endCRST, 7; +-:-:-:-:00 ISETP.EQ.AND P1, PT, RZ, partial, PT; +-:-:-:-:00 @P1 MOV32I partial, 8; +// channel = posCRST / lutSize +// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it +-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST; +-:-:-:-:00 FMUL channel, posCRSTf, lutSizeRcp; +-:-:-:-:00 FFMA channel, channel, 5.9604644775390625e-08, channel; +-:-:-:-:00 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST % lutSize) * 8 +-:-:-:-:00 IMAD tmp_param0, channel, lutSize, RZ; +-:-:-:-:00 IADD lutOffset, -tmp_param0, posCRST; + +-:-:-:-:00 SHL lutOffset, lutOffset, 3; +// P1 = tidY < partial +-:-:-:-:00 ISETP.LT.AND P1, PT, tidY, partial, PT; +// offsetIC = channel * DHWN +// offsetFC = channel * KRST +-:-:-:-:00 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ; +-:-:-:-:00 IMAD offsetFc, channel, param_KRST, RZ; +// posCRST -= partial +-:-:-:-:00 IADD posCRST, posCRST, -partial; +-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut]; + +// trackF = offsetFK + offsetFC + sliceF + param_F +-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc; +-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF; +// trackI = offsetIN + offsetIC + sliceI + param_I +-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc; +-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI; + +//-:-:-:-:00
@P1 LEA trackF0.CC, offsetF, param_F[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2; +-:-:-:-:00 @P1 MOV tmp_param0, param_F[0]; +-:-:-:-:00 @P1 MOV tmp_param1, param_F[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2; +-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 @P1 IADD.X trackF1, RZ, tmp_param1; +//-:-:-:-:00 @P1 LEA trackI0.CC, offsetI, param_I[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2; +-:-:-:-:00 @P1 MOV tmp_param0, param_I[0]; +-:-:-:-:00 @P1 MOV tmp_param1, param_I[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2; +-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 @P1 IADD.X trackI1, RZ, tmp_param1; + +-:-:-:-:00 @P1 LD.E.CI.128 loadF0, [trackF + 4x< 0>]; +-:-:-:-:00 @P1 LD.E.CI.128 loadF4, [trackF + 4x<32>]; +-:-:-:-:00 @!P1 LDS.128 loadF0, [RZ + addr_zero]; +-:-:-:-:00 @!P1 LDS.128 loadF4, [RZ + addr_zero]; + +-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI + 4x< 0>]; +-:-:-:-:00 @P1 LD.E.128 loadI4, [trackI + 4x<32>]; +-:-:-:-:00 @!P1 LDS.128 loadI0, [RZ + addr_zero]; +-:-:-:-:00 @!P1 LDS.128 loadI4, [RZ + addr_zero]; + +-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT; + +-:-:-:-:00 STS.128 [writeS + 4x<0*64 + 0>], loadF0; +-:-:-:-:00 STS.128 [writeS + 4x<0*64 + 32>], loadF4; + +-:-:-:-:00 STS.128 [writeS + 4x<8*64 + 0>], loadI0; +-:-:-:-:00 STS.128 [writeS + 4x<8*64 + 32>], loadI4; + +-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST; + +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 LOP.XOR writeS, writeS, 4x<64*8*2>; + +-:-:-:-:00 LDS.128 j0Ix0, [readIs + 4x<0*64 + 00>]; +-:-:-:-:00 LDS.128 j0Fy0, [readFs + 4x<0*64 + 00>]; +-:-:-:-:00 LDS.128 j0Ix4, [readIs + 4x<0*64 + 32>]; +-:-:-:-:00 LDS.128 j0Fy4, [readFs + 4x<0*64 + 32>]; + +// channel = posCRST / lutSize +-:-:-:-:00 @P1 FMUL channel, posCRSTf, lutSizeRcp; +-:-:-:-:00 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel; +-:-:-:-:00 @P1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST % lutSize) * 8 +-:-:-:-:00 @P1 IMAD 
tmp_param0, channel, lutSize, RZ; +-:-:-:-:00 @P1 IADD lutOffset, -tmp_param0, posCRST; +-:-:-:-:00 @P1 SHL lutOffset, lutOffset, 3; +// offsetIC = channel * DHWN +// offsetFC = channel * K +-:-:-:-:00 @P1 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ; +-:-:-:-:00 @P1 IMAD offsetFc, channel, param_KRST, RZ; + +-:-:-:-:00 IADD posCRST, posCRST, -8; +-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut]; + +// trackI = offsetIN + offsetIC + sliceI + param_I +-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc; +-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF; +// trackF = offsetFK + offsetFC + sliceF + param_F +-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc; +-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI; +//-:-:-:-:00 @P1 LEA trackF0.CC, offsetF, param_F[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2; +-:-:-:-:00 @P1 MOV addressF0, param_F[0]; +-:-:-:-:00 @P1 MOV addressF1, param_F[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2; +-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, addressF0; +-:-:-:-:00 @P1 IADD.X trackF1, RZ, addressF1; +//-:-:-:-:00 @P1 LEA trackI0.CC, offsetI, param_I[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2; +-:-:-:-:00 @P1 MOV addressI0, param_I[0]; +-:-:-:-:00 @P1 MOV addressI1, param_I[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2; +-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, addressI0; +-:-:-:-:00 @P1 IADD.X trackI1, RZ, addressI1; +-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF + 4x< 0>]; +-:-:-:-:00 @P1 LD.E.128 loadF4, [trackF + 4x<32>]; +-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI + 4x< 0>]; +-:-:-:-:00 @P1 LD.E.128 loadI4, [trackI + 4x<32>]; +-:-:-:-:00 MOV32I tmp_data, 128; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + +LOOP: + + + my %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c53 => "-:-:-:-:00 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + j0c61 => "-:-:-:-:00 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + j0c62 => "-:-:-:-:00 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + 
j0c63 => "-:-:-:-:00 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + + j1c47 => "-:-:-:-:00 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + j1c53 => "-:-:-:-:00 \@P1 IMAD tmp_param0, channel, lutSize, RZ;\n", + j1c61 => "-:-:-:-:00 \@P1 IADD lutOffset, -tmp_param0, posCRST;\n", + j1c62 => "-:-:-:-:00 \@P1 IMAD offsetF, channel, param_KRST, offsetFk;\n", + j1c63 => "-:-:-:-:00 \@P1 IMAD offsetI, channel, param_DHWN, offsetIn;\n", + + j2c47 => "-:-:-:-:00 \@P1 SHL lutOffset, lutOffset, 3;\n", + j2c53 => "-:-:-:-:00 IADD posCRST, posCRST, -8;\n", + j2c61 => "-:-:-:-:00 \@P1 LDS.64 sliceIF, [lutOffset + addr_lut];\n", + + j3c47 => "-:-:-:-:00 \@P1 IADD offsetF, offsetF, sliceF;\n", + j3c53 => "-:-:-:-:00 TEXDEPBAR 0x0;\n", + j3c61 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetF, 0x2;\n", + j3c62 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*64 + 0>], loadF0;\n", + j3c63 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*64 + 32>], loadF4;\n", + + j4c47 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*64 + 0>], loadI0;\n", + j4c53 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*64 + 32>], loadI4;\n", + j4c61 => "-:-:-:-:00 \@P1 IADD trackF0.CC, tmp_shl, addressF0;\n", + j4c62 => "-:-:-:-:00 \@P1 IADD.X trackF1, RZ, addressF1;\n", + j4c63 => "-:-:-:-:00 \@P1 IADD offsetI, offsetI, sliceI;\n", + + j5c47 => "-:G:D:-:00 \@P1 LDG.E.128 loadF0, [trackF];\n", + j5c53 => "-:-:-:-:00 \@P1 IADD tmp_param0.CC, tmp_data, trackF0;\n", + j5c61 => "-:-:-:-:00 \@P1 IADD.X tmp_param1, RZ, trackF1;\n", + j5c62 => "-:G:D:-:00 \@P1 LDG.E.128 loadF4, [tmp_param];\n", + j5c63 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n", + + j6c47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetI, 0x2;\n", + j6c53 => "-:-:-:-:00 \@P1 IADD trackI0.CC, tmp_shl, addressI0;\n", + j6c61 => "-:-:-:-:00 \@P1 IADD.X trackI1, RZ, addressI1;\n", + + j6c63 => "-:-:-:-:00 \@P0 BAR.SYNC 0;\n". + "-:-:-:-:00 \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 NOP;\n". 
+ "-:-:-:-:00 \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 NOP;\n", + + j7c47 => "-:G:D:-:00 \@P1 LDG.E.128 loadI0, [trackI + 4x<0>];\n", + j7c53 => "-:-:-:-:00 \@P1 IADD tmp_param0.CC, tmp_data, trackI0;\n", + + j7c61 => "-:-:-:-:00 \@P1 IADD.X tmp_param1, RZ, trackI1;\n", + j7c62 => "-:G:D:-:00 \@P1 LDG.E.128 loadI4, [tmp_param];\n", + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n", + ); + + my @cOrder; + + push @cOrder, [0,0]; + push @cOrder, [0,1]; + push @cOrder, [1,1]; + push @cOrder, [2,0]; + push @cOrder, [1,0]; + push @cOrder, [2,1]; + push @cOrder, [2,3]; + push @cOrder, [2,2]; + push @cOrder, [1,2]; + push @cOrder, [0,3]; + push @cOrder, [1,3]; + push @cOrder, [0,2]; + push @cOrder, [0,4]; + push @cOrder, [0,5]; + push @cOrder, [1,5]; + push @cOrder, [2,4]; + push @cOrder, [1,4]; + push @cOrder, [2,5]; + push @cOrder, [2,7]; + push @cOrder, [2,6]; + push @cOrder, [1,6]; + push @cOrder, [0,7]; + push @cOrder, [1,7]; + push @cOrder, [0,6]; + push @cOrder, [3,6]; + push @cOrder, [3,7]; + push @cOrder, [4,7]; + push @cOrder, [5,6]; + push @cOrder, [4,6]; + push @cOrder, [5,7]; + push @cOrder, [5,5]; + push @cOrder, [5,4]; + push @cOrder, [4,4]; + push @cOrder, [3,5]; + push @cOrder, [4,5]; + push @cOrder, [3,4]; + push @cOrder, [3,2]; + push @cOrder, [3,3]; + push @cOrder, [4,3]; + push @cOrder, [5,2]; + push @cOrder, [4,2]; + push @cOrder, [5,3]; + push @cOrder, [5,1]; + push @cOrder, [5,0]; + push @cOrder, [4,0]; + push @cOrder, [3,1]; + push @cOrder, [4,1]; + push @cOrder, [3,0]; + push @cOrder, [6,0]; + push @cOrder, [7,0]; + push @cOrder, [7,1]; + push @cOrder, [6,2]; + push @cOrder, [6,1]; + push @cOrder, [7,2]; + push @cOrder, [7,5]; + push @cOrder, [6,5]; + push @cOrder, [6,4]; + push @cOrder, [7,3]; + push @cOrder, [7,4]; + push @cOrder, [6,3]; + push @cOrder, [6,6]; + push @cOrder, [6,7]; + push @cOrder, [7,7]; + push @cOrder, [7,6]; + + my $out; + foreach my $j (0 .. 
7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c5"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx0, [readIs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx2, [readIs + 4x<%d*64 + 2>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx4, [readIs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx6, [readIs + 4x<%d*64 + 34>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy0, [readFs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c29"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy2, [readFs + 4x<%d*64 + 2>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy4, [readFs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy6, [readFs + 4x<%d*64 + 34>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $ctrl = "-:-:-:-:00"; + + if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) { + $ins = "-:G:D:-:00 NOP;\n"; + } + + if ($c > 60 && !$ins){ + $ins = "-:-:D:-:07 NOP;\n"; + } + + # 04 and 05 are dual issued + if($ins) { + $ctrl = "-:-:D:-:04"; + } else { + if(($c - 1) % 6 == 0 || ($c - 3) % 6 == 0){ + $ctrl = "-:-:D:-:04"; + } + else{ + $ctrl = "-:-:D:-:05"; + } + } + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +-:-:-:-:00 LDS.128 mpq, [RZ + addr_m]; +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkI, SR_CTAID.Z; +-:-:-:-:00 S2R blkF, SR_CTAID.Y; + +// tidOX = (tid & 7) << 2 +// tidOY = tid >> 3 +-:-:-:-:00 LOP.AND tidOX, tid, 7; +-:-:-:-:00 SHL tidOX, tidOX, 2; +-:-:-:-:00 SHR.U32 tidOY, tid, 3; + +-:-:-:-:00 LOP.AND readIs, readIs, 0x7ff; +-:-:-:-:00 LOP.AND readFs, readFs, 0x7ff; + +// Div by 4 here collapses k stride +// writeCs = (readKs / 4) * 64 + readNs; +-:-:-:-:00 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +-:-:-:-:00 ISCADD readCs, tidOY, tidOX, 6; +-:-:-:-:00 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +-:-:-:-:00 ISCADD n, blkI, tidOX, 6; + +// Mul by 4 here expands k stride back out +// k = blkF*64 + tidOY * 4 +-:-:-:-:00 SHL tidOY, tidOY, 2; +-:-:-:-:00 ISCADD k, blkF, tidOY, 6; + +// o = k*MPQN + m*PQN + p*QN + q*N + n +-:-:-:-:00 IMAD to, q, param_N, n; +-:-:-:-:00 IMAD.U32.U32 to, p, param_QN, to; +-:-:-:-:00 IMAD.U32.U32 to, m, param_PQN, to; +-:-:-:-:00 IMAD.U32.U32 to, k, param_MPQN, to; +//-:-:-:-:00 LEA Out0.CC, to, param_O[0], 2; +//-:-:-:-:00 LEA.HI.X Out1, to, param_O[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_O[0]; +-:-:-:-:00 MOV tmp_param1, param_O[1]; +-:-:-:-:00 SHL tmp_shl, to, 0x2; +-:-:-:-:00 IADD Out0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X Out1, RZ, tmp_param1; + +-:-:-:-:00 MOV MPQN, param_MPQN; 
+-:-:-:-:00 SHL MPQN1, MPQN, 2; +-:-:-:-:00 SHL MPQN4, MPQN, 4; +-:-:-:-:00 ISCADD MPQN28, MPQN, -MPQN4, 7; + +-:-:-:-:00 ISETP.LT.AND P0, PT, n, param_N, PT; // n + 0 < N +-:-:-:-:00 IADD n, n, 32; +-:-:-:-:00 ISETP.LT.AND P1, PT, n, param_N, PT; // n + 32 < N + +-:-:-:-:00 MOV alpha, param_alpha; + +-:-:-:-:00 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + if ($y == 4) + { + $out .= sprintf( + "-:-:-:-:00 IADD Out0.CC, Out0, MPQN28;\n" . + "-:-:-:-:00 IADD k, k, 28;\n" . + "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n" . + "-:-:-:-:00 IADD.X Out1, Out1, RZ;\n\n", + ($y) x 8); + } + else + { + $out .= sprintf( + "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" . 
+ "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n\n", + ($y) x 8); + } + + $out .= "-:-:-:-:00 CAL STORE_C;\n\n"; + } + return $out; + + + +-:-:-:-:00 EXIT; + +STORE_C: + +-:-:-:-:00 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n + 0 < N +-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n + 32 < N +-:-:-:-:00 IADD k, k, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +-:-:-:-:00 STS.128 [writeCs + 4x<00>], cs0; +-:-:-:-:00 STS.128 [writeCs + 4x<32>], cs4; +-:-:-:-:00 LDS.128 cs0, [readCs + 4x<00>]; +-:-:-:-:00 LDS.128 cs4, [readCs + 4x<32>]; + +// Store results back to global +-:-:-:-:00 @P2 ST.E.128 [Out + 4x<00>], cs0; +-:-:-:-:00 @P3 ST.E.128 [Out + 4x<32>], cs4; + +-:-:-:-:00 IADD Out0.CC, Out0, MPQN1; +-:-:-:-:00 IADD.X Out1, Out1, RZ; + +-:-:-:-:00 RET; + diff --git a/Kernel/Convolution/Kepler/sconv_fprop.cu b/Kernel/Convolution/Kepler/sconv_fprop.cu new file mode 100644 index 0000000..51f1979 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_fprop.cu @@ -0,0 +1,221 @@ +#include "sconv.h" + +bool fprop_K64_N64(const float *I, const float *F, float *O, + unsigned int N, unsigned int C, unsigned int K, + unsigned int D, unsigned int H, unsigned int W, + unsigned int R, unsigned int S, unsigned int T, + unsigned int M, unsigned int P, unsigned int Q, + unsigned int str_d, unsigned int str_h, unsigned int str_w, + unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) { + std::string kernel_name = "sconv_fprop_K64_N64"; + float alpha = 1.0f; + unsigned int WN, HWN, DHWN, KRST, RST, RS, PQ, QN, PQM, PQN, MPQN; + unsigned int magic_RS, magic_S; + unsigned int shift_RS, shift_S; + unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ; + // input + WN = W * N; + HWN = H * WN; + DHWN = D * HWN; + // filter + RS = R * S; + RST = T * RS; + KRST = K * RST; + // output + QN = Q * N; + PQ = P * Q; + PQM = PQ * M; + PQN = P * QN; + MPQN = M * PQN; + // magic numbers + magic32(PQ, Q, magic_Q, shift_Q); + magic32(PQM, PQ, magic_PQ, shift_PQ); + 
magic32(RST + 32, RS, magic_RS, shift_RS); + magic32(RS + 32, S, magic_S, shift_S); + // test param set up + float *test_param; + cudaError_t cuda_error; + cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024); + cudaMemset(test_param, 0, sizeof(float) * 1024); + // arguments + void *args[37] = {&test_param, &O, &I, &F, &alpha, + &N, &K, &D, &H, &W, &WN, &HWN, &DHWN, + &C, &KRST, &RST, &RS, &magic_RS, &shift_RS, &S, &magic_S, &shift_S, + &pad_d, &pad_h, &pad_w, &str_d, &str_h, &str_w, + &Q, &PQ, &QN, &PQN, &MPQN, &magic_Q, &shift_Q, &magic_PQ, &shift_PQ}; + int gridMPQ = M * P * Q; + int gridX = gridMPQ; + int gridY = K / 64 + (K % 64 != 0); + int gridZ = N / 64 + (N % 64 != 0); + CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], + gridX, gridY, gridZ, 64, 1, 1, 64 * 8 * 4 + RST * 4 * 2 + 8, 0, args, NULL); + if (res != CUDA_SUCCESS) { + std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl; + return false; + } + cuCtxSynchronize(); + // output test_param + float* h_test = (float *)malloc(sizeof(float) * 64); + for (int i = 0; i < 64; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 64, cudaMemcpyDeviceToHost); + if (cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 64; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + // free test_param + free(h_test); + return true; +} + +bool fprop_K128_N128(const float *I, const float *F, float *O, + unsigned int N, unsigned int C, unsigned int K, + unsigned int D, unsigned int H, unsigned int W, + unsigned int R, unsigned int S, unsigned int T, + unsigned int M, unsigned int P, unsigned int Q, + unsigned int str_d, unsigned int str_h, unsigned int str_w, + unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) { + std::string kernel_name = 
"sconv_fprop_K128_N128"; + float alpha = 1.0f; + unsigned int WN, HWN, DHWN, KRST, RST, RS, PQ, QN, PQM, PQN, MPQN; + unsigned int magic_RS, magic_S; + unsigned int shift_RS, shift_S; + unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ; + // input + WN = W * N; + HWN = H * WN; + DHWN = D * HWN; + // filter + RS = R * S; + RST = T * RS; + KRST = K * RST; + // output + QN = Q * N; + PQ = P * Q; + PQM = PQ * M; + PQN = P * QN; + MPQN = M * PQN; + // magic numbers + magic32(PQ, Q, magic_Q, shift_Q); + magic32(PQM, PQ, magic_PQ, shift_PQ); + magic32(RST + 32, RS, magic_RS, shift_RS); + magic32(RS + 32, S, magic_S, shift_S); + // test param set up + float *test_param; + cudaError_t cuda_error; + cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024); + cudaMemset(test_param, 0, sizeof(float) * 1024); + // arguments + void *args[37] = {&test_param, &O, &I, &F, &alpha, + &N, &K, &D, &H, &W, &WN, &HWN, &DHWN, + &C, &KRST, &RST, &RS, &magic_RS, &shift_RS, &S, &magic_S, &shift_S, + &pad_d, &pad_h, &pad_w, &str_d, &str_h, &str_w, + &Q, &PQ, &QN, &PQN, &MPQN, &magic_Q, &shift_Q, &magic_PQ, &shift_PQ}; + int gridMPQ = M * P * Q; + int gridX = gridMPQ; + int gridY = K / 128 + (K % 128 != 0); + int gridZ = N / 128 + (N % 128 != 0); + CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], + gridX, gridY, gridZ, 256, 1, 1, 128 * 8 * 4 + RST * 4 * 2 + 8, 0, args, NULL); + if (res != CUDA_SUCCESS) { + std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl; + return false; + } + cuCtxSynchronize(); + // output test_param + float* h_test = (float *)malloc(sizeof(float) * 128); + for (int i = 0; i < 128; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 128, cudaMemcpyDeviceToHost); + if (cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 128; 
++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + // free test_param + free(h_test); + return true; +} + +int main() { + // init + cudaFree(0); + // params + float *d_I, *d_F, *d_O; + unsigned int N = 128, C = 1, K = 128, D = 1, H = 5, W = 5, T = 1, R = 5, S = 5; + unsigned int str_d = 1, str_h = 1, str_w = 1; + unsigned int pad_d = 0, pad_h = 0, pad_w = 0; + unsigned int M, P, Q; + cudaError_t cuda_error; + M = (D - T + 2 * pad_d) / str_d + 1; + P = (H - R + 2 * pad_h) / str_h + 1; + Q = (W - S + 2 * pad_w) / str_w + 1; + // host memory + float *h_I = (float *)malloc(C * D * H * W * N * sizeof(float)); + for (int i = 0; i < C * D * H * W; ++i) { + for (int j = 0; j < N; ++j) { + h_I[i * N + j] = j; + } + } + float *h_F = (float *)malloc(C * R * S * T * K * sizeof(float)); + for (int i = 0; i < C * R * S * T * K; ++i) { + h_F[i] = 1; + } + float* h_O = (float *)malloc(sizeof(float) * K * M * P * Q * N); + // device memory + cudaMalloc((void**)&d_I, sizeof(float) * C * D * H * W * N); + cudaMalloc((void**)&d_F, sizeof(float) * C * R * S * T * K); + cudaMalloc((void**)&d_O, sizeof(float) * K * M * P * Q * N); + // memcpy h_I, h_F + cudaMemcpy(d_I, h_I, sizeof(float) * C * D * H * W * N, + cudaMemcpyHostToDevice); + cudaMemcpy(d_F, h_F, sizeof(float) * C * R * S * T * K, + cudaMemcpyHostToDevice); + // load kernels + if (!load_kernels("./")) { + std::cerr << "Couldn't load all kernels" << std::endl; + exit(1); + } + // launch kernel + if (K <= 64) { + if (!fprop_K64_N64(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) { + std::cerr << "Launch error" << std::endl; + } + } else { + if (!fprop_K128_N128(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) { + std::cerr << "Launch error" << std::endl; + } + } + // output + std::cout << "Result" << std::endl; + cuda_error = cudaMemcpy(h_O, d_O, sizeof(float) * K * M * P * Q * N, cudaMemcpyDeviceToHost); + if 
(cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 100; ++i) { + std::cout << h_O[i] << " "; + } + std::cout << std::endl; + // free memory + free(h_O); + free(h_I); + free(h_F); + cudaFree(d_I); + cudaFree(d_F); + cudaFree(d_O); + // run successfully + std::cout << "finish" << std::endl; + return 0; +} diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.cu b/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.cu new file mode 100644 index 0000000..91f813d --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.cu @@ -0,0 +1,48 @@ +extern "C" +__global__ void sconv_fprop_K128_N128 ( + float* param_test, + float *param_O, + const float *param_I, + const float *param_F, + float param_alpha, + int param_N, + int param_K, + int param_D, + int param_H, + int param_W, + int param_WN, + int param_HWN, + int param_DHWN, + int param_C, + int param_KRST, + int param_RST, + int param_RS, + int param_magic_RS, + int param_shift_RS, + int param_S, + int param_magic_S, + int param_shift_S, + int param_pad_d, + int param_pad_h, + int param_pad_w, + int param_str_d, + int param_str_h, + int param_str_w, + int param_Q, + int param_PQ, + int param_QN, + int param_PQN, + int param_MPQN, + int param_magic_Q, + int param_shift_Q, + int param_magic_PQ, + int param_shift_PQ) { + __shared__ float share[128 * 8 * 4 + 8]; + + int tid = threadIdx.x; + + share[tid] = 1; + + *param_O = share[127-tid]; + *param_test = share[127-tid]; +} diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.sass b/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.sass new file mode 100644 index 0000000..6582360 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.sass @@ -0,0 +1,791 @@ +# Kernel: sconv_fprop_K128_N128 +// debug: +// mode1 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +//-:-:-:-:00 SHL tmp_shl, tid, 0x2; +//-:-:-:-:00 
IADD tmp_param00.CC, tmp_shl, tmp_param0; +//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +//-:-:-:-:00 I2F.F32.U32 rst, rst; +//-:-:-:-:00 ST.E [tmp_param00], rst; +//-:-:-:-:00 EXIT; + +// mode2 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +// +//-:-:-:-:00 MOV32I k, 0x40000000; +//-:-:-:-:00 ST.E [tmp_param0], k; +//-:-:-:-:00 EXIT; + +// modify steps: +// XMAD->IMAD +// shared memory addresses->RZ +// LDG->LD +// LEA->MOV, IADD, SHL +// XMAD.LO2C->IMAD.U32.U32 +// XMAD.PSL->IMAD.U32.U32 +// VMAD->IMAD, IADD +// MOV->MOV32I +// IADD3->IADD, IADD +// POPC +// ST.CG->ST +// control code +// comments +// LDS.U->LDS +// register<0-7>->register<0-3>, register<4-7> +// avoid register conflicts + +// optimization steps: +// alexnet2 +// initial->1200 +// bank conflict->1288 +// alignment+dual issue+reuse->1600 +// half ldg.128->1700 +// all ldg.128->1777 +// control codes->1900 +// scheduling->1937 +// reduce unnecessary instructions->2100 + + + szShareF : (128*8) + szShareI : (128*8) + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + addr_m : 4x<128*8*2 + 128*8*2 + 4> + addr_p : 4x<128*8*2 + 128*8*2 + 5> + addr_q : 4x<128*8*2 + 128*8*2 + 6> + addr_szLut : 4x<128*8*2 + 128*8*2 + 7> + addr_lut : 4x<128*8*2 + 128*8*2 + 8> + + param_test[0] : c[0x0][0x140] + param_test[1] : c[0x0][0x144] + param_O[0] : c[0x0][0x148] + param_O[1] : c[0x0][0x14c] + param_I[0] : c[0x0][0x150] + param_I[1] : c[0x0][0x154] + param_F[0] : c[0x0][0x158] + param_F[1] : c[0x0][0x15c] + param_alpha : c[0x0][0x160] + param_N : c[0x0][0x164] + param_K : c[0x0][0x168] + param_D : c[0x0][0x16c] + param_H : c[0x0][0x170] + param_W : c[0x0][0x174] + param_WN : c[0x0][0x178] + param_HWN : c[0x0][0x17c] + param_DHWN : c[0x0][0x180] + param_C : c[0x0][0x184] + param_KRST : c[0x0][0x188] + param_RST : c[0x0][0x18c] + param_RS : c[0x0][0x190] + param_magic_RS : c[0x0][0x194] + param_shift_RS : c[0x0][0x198] + param_S : c[0x0][0x19c] + param_magic_S : c[0x0][0x1a0] + 
param_shift_S : c[0x0][0x1a4] + param_pad_d : c[0x0][0x1a8] + param_pad_h : c[0x0][0x1ac] + param_pad_w : c[0x0][0x1b0] + param_str_d : c[0x0][0x1b4] + param_str_h : c[0x0][0x1b8] + param_str_w : c[0x0][0x1bc] + param_Q : c[0x0][0x1c0] + param_PQ : c[0x0][0x1c4] + param_QN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_MPQN : c[0x0][0x1d0] + param_magic_Q : c[0x0][0x1d4] + param_shift_Q : c[0x0][0x1d8] + param_magic_PQ : c[0x0][0x1dc] + param_shift_PQ : c[0x0][0x1e0] + + + + + 64-67 : mpq<0-3> + 64-67 : m, p, q, tidY + 68-72 : blkF, blkI, blkMPQ, tid1, tidX + 73-95 ~ pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 : j0Fy<0-3> + 68-71 : j0Ix<0-3> + 72-75 : j0Fy<4-7> + 76-79 : j0Ix<4-7> + 80-83 : j1Fy<0-3> + 84-87 : j1Ix<0-3> + 88-91 : j1Fy<4-7> + 92-95 : j1Ix<4-7> + + 96-97 : trackI<0-1> + 98-99 : trackF<0-1> + + 100-103 : loadI<0-3> + 104-107 : loadF<0-3> + 109 : readFs + 108 : readIs + + 110-114 ~ offsetIn, offsetFk, posCRST, lutSize, lutSizeRcp + 115-120 ~ writeS, posCRSTf, channel, lutOffset, offsetI, offsetF + 116-120 ~ tid128, tid, p_and + 121 : tmp_shl + + 122-123 : sliceI, sliceF + 122-123 : sliceIF<0-1> + 124-125 ~ offsetIc, offsetFc + 124-125 : tmp_param<0-1> + 124-127 ~ addressF0, addressF1, addressI0, addressI1 + + 72-79 : cs<0-7> + 80-81 : Out<0-1> + + 82-120 ~ writeCs, readCs, alpha, tidOX, tidOX2, tidOY, to, k, n, MPQN1, MPQN60, MPQN, MPQN4 + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkF, SR_CTAID.Y; #K128 +-:-:-:-:00 S2R blkI, SR_CTAID.Z; 
#N128 +-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; # m,p,q stored in x index + +// if tid >= 32 +// P0 = true +-:-:-:-:00 ISETP.GE.AND P0, PT, tid, 32, PT; + +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + return join '', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15; + + +// tid <= 127 +// tidX = (tid & 31) << 2 +// tidX = 0 : 4 : 128 +// tidY = tid >> 5 +// tidY = 0 : 1 : 7 +-:-:-:-:00 LOP.AND tidX, tid, 31; +-:-:-:-:00 SHL tidX, tidX, 2; +-:-:-:-:00 SHR.U32 tidY, tid, 5; + +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +//-:-:-:-:00 SHL tmp_shl, tid, 0x2; +//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0; +//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +//-:-:-:-:00 I2F.F32.U32 tidY, tidY; +//-:-:-:-:00 ST.E [tmp_param00], tidY; +//-:-:-:-:00 EXIT; + +// offsetFk += blkF * 128 + tidX +// K128 +// blkF ---- trackF +// tidX +-:-:-:-:00 ISCADD offsetFk, blkF, tidX, 7; + +// offsetIn += blkI * 128 + tidX +// N128 +// blkI ---- trackI +// tidX +-:-:-:-:00 ISCADD offsetIn, blkI, tidX, 7; + +// writeS = (128 * tidY + (tidX >> 1)) * 4 +// tidY = 0 : 1 : 7 +// tidX = 0 : 4 : 128 +// ---------------- +// ---------------- tidY 0 : 1 : 7 +// ---- writeS +// tidX +-:-:-:-:00 SHR tidX, tidX, 1; +-:-:-:-:00 ISCADD writeS, tidY, tidX, 7; +-:-:-:-:00 SHL writeS, writeS, 2; + +// readFs = (((tid & 0x70) >> 3) | (tid & 1)) << 3; +// [6][5][4][0] * 8; +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 LOP.AND readFs, tid, 0x70; +-:-:-:-:00 SHR.U32 readFs, readFs, 3; +-:-:-:-:00 LOP.OR readFs, readFs, tid1; +-:-:-:-:00 SHL readFs, readFs, 3; + +// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7) +// [3][2][1] * 16; +-:-:-:-:00 LOP.AND tid128, tid, 128; +-:-:-:-:00 SHR.U32 tid128, tid128, 3; +-:-:-:-:00 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 LOP.OR readIs, readIs, tid128; +-:-:-:-:00 ISCADD readIs, readIs, 4x, 3; + +-:-:-:-:00 @P0 BRA.U END_SETUP; + +-:-:-:-:00 MOV rst, tid; +-:-:-:-:00 MOV lutStore2, 
RZ; +-:-:-:-:00 MOV lutSize, RZ; +-:-:-:-:00 MOV32I warp_count, 32; + +// m = blkMPQ / PQ +// pq = blkMPQ % PQ +-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ; +-:-:-:-:00 SHR.U32 m, m, param_shift_PQ; +-:-:-:-:00 IMAD pq, m, param_PQ, RZ; +-:-:-:-:00 IADD pq, -pq, blkMPQ; +// p = pq / Q +// q = pq % Q +-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ; +-:-:-:-:00 SHR.U32 p, p, param_shift_Q; +-:-:-:-:00 IMAD q, p, param_Q, RZ; +-:-:-:-:00 IADD q, -q, pq; + +// dep_thd_mask = -1 +-:-:-:-:00 MOV32I dep_thd_mask, -1; + +// if p is odd +// set q = param_Q - q - 1 +// if p is even +// q = q +-:-:-:-:00 LOP.AND p_and, p, 1; +-:-:-:-:00 ISETP.NE.AND P1, PT, p_and, RZ, PT; +-:-:-:-:00 @P1 IADD q, -q, param_Q; +-:-:-:-:00 @P1 IADD q, q, dep_thd_mask; + +-:-:-:-:00 STS.128 [RZ + addr_m], m; + +// mt = m * w - pad_d +// pr = p * u - pad_h +// qs = q * v - pad_w +-:-:-:-:00 IMAD qs, q, param_str_w, RZ; +-:-:-:-:00 IMAD pr, p, param_str_h, RZ; +-:-:-:-:00 IMAD mt, m, param_str_d, RZ; +-:-:-:-:00 IADD qs, qs, -param_pad_w; +-:-:-:-:00 IADD pr, pr, -param_pad_h; +-:-:-:-:00 IADD mt, mt, -param_pad_d; + +// mask_shr = 32 - tid +// dep_thd_mask = dep_thd_mask >> mask_shr +-:-:-:-:00 IADD mask_shr, -tid, 32; +-:-:-:-:00 SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr; + +LUT_LOOP: + +// warp synchronous loop while warp_count < RST +-:-:-:-:00 ISETP.LT.AND P0, PT, warp_count, param_RST, PT; +-:-:-:-:00 IADD warp_count, warp_count, 32; +// t = rst / RS +// rs = rst % RS +-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ; +-:-:-:-:00 SHR.U32 t, t, param_shift_RS; +-:-:-:-:00 IMAD rs, t, param_RS, RZ; +-:-:-:-:00 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ; +-:-:-:-:00 SHR.U32 r, r, param_shift_S; +-:-:-:-:00 IMAD s, r, param_S, RZ; +-:-:-:-:00 IADD s, -s, rs; +// x = qs + s +// y = pr + r +// z = mt + t +-:-:-:-:00 IADD x, qs, s; +-:-:-:-:00 IADD y, pr, r; +-:-:-:-:00 IADD z, mt, t; +-:-:-:-:00 ISETP.GE.AND P4, PT, x, RZ, PT; 
+-:-:-:-:00 ISETP.GE.AND P5, PT, y, RZ, PT; +-:-:-:-:00 ISETP.GE.AND P6, PT, z, RZ, PT; +-:-:-:-:00 ISETP.LT.AND P4, PT, x, param_W, P4; +-:-:-:-:00 ISETP.LT.AND P5, PT, y, param_H, P5; +-:-:-:-:00 ISETP.LT.AND P6, PT, z, param_D, P6; +-:-:-:-:00 PSETP.AND.AND P1, PT, P4, P5, P6; + +// sliceI = z*HWN + y*WN + x*N +// rst N +// -------------- -------------- +// -------------- -------------- +// -------------- -------------- +// -------------- K * rst -------------- +// -------------- -------------- +// -------------- -------------- +-:-:-:-:00 IMAD sliceI, x, param_N, RZ; +-:-:-:-:00 IMAD.U32.U32 sliceI, y, param_WN, sliceI; +-:-:-:-:00 IMAD.U32.U32 sliceI, z, param_HWN, sliceI; + +// sliceF = rst * K +-:-:-:-:00 IMAD sliceF, rst, param_K, RZ; + +// Get a mask of all valid slices in the warp +-:-:-:-:00 VOTE.ANY ballot, PT, P1; +// Count the total valid slices +-:-:-:-:00 POPC warp_slices, ballot, ballot; +// Prepare lutStore for this and next loop +// lutStore = lutStore2 +// lutStore2 = warp_slices * 8 + lutStore2 +-:-:-:-:00 @P1 MOV lutStore, lutStore2; +-:-:-:-:00 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +// bit(dep_thd_mask) = tid +// bit(ballot) = valid tid +// dep_thd_cnt = number of bit below ballot +-:-:-:-:00 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot; +-:-:-:-:00 @P1 POPC dep_thd_cnt, dep_thd_bits, dep_thd_bits; +// use the rst increment to space the barrier sync +// rst = rst + 32 +-:-:-:-:00 IADD rst, rst, 32; +// Update the lutStore address from this count +// lutStore = dep_thd_cnt * 8 + lutStore +-:-:-:-:00 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +-:-:-:-:00 @P1 STS.64 [lutStore + addr_lut], sliceIF; + +// Keep track of the total size of the lut +// lutSize = lutSize + warp_slices +-:-:-:-:00 IADD lutSize, lutSize, warp_slices; + +-:-:-:-:00 @P0 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +-:-:-:-:00 STS [RZ + addr_szLut], 
lutSize; + +// if tid >= 32, directly enter it +END_SETUP: + +-:-:-:-:00 BAR.SYNC 0; + +// Grab the calculated lut size and get its reciprocal +// Get the total reduction depth +-:-:-:-:00 LDS lutSize, [RZ + addr_szLut]; + +// endCRST = lutSize * param_C (channel) +-:-:-:-:00 IMAD endCRST, lutSize, param_C, RZ; +// lutSizeRcp = 1 / lutSize +-:-:-:-:00 I2F.F32.S32 lutSizeRcp, lutSize; +-:-:-:-:00 MUFU.RCP lutSizeRcp, lutSizeRcp; + +// posCRST = endCRST - tidY - 1 +-:-:-:-:00 IADD posCRST, endCRST, -1; +-:-:-:-:00 IADD posCRST, posCRST, -tidY; + +// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch. +// If it is a multiple of 8 then make a full 8 line fetch. +-:-:-:-:00 LOP.AND partial, endCRST, 7; +-:-:-:-:00 ISETP.EQ.AND P1, PT, RZ, partial, PT; +// If partial == 0 +-:-:-:-:00 @P1 MOV32I partial, 8; +// channel = lower(posCRST / lutSize) +// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it +-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST; +-:-:-:-:00 FMUL channel, posCRSTf, lutSizeRcp; +-:-:-:-:00 FFMA channel, channel, 5.9604644775390625e-08, channel; +-:-:-:-:00 F2I.S32.F32.TRUNC channel, channel; + +// lutOffset = (posCRST % lutSize) * 8 +-:-:-:-:00 IMAD tmp_param0, channel, lutSize, RZ; +-:-:-:-:00 IADD lutOffset, -tmp_param0, posCRST; +-:-:-:-:00 SHL lutOffset, lutOffset, 3; +// P1 = tidY < partial +-:-:-:-:00 ISETP.LT.AND P1, PT, tidY, partial, PT; + +// offsetIC = channel * DHWN +// offsetFC = channel * KRST +-:-:-:-:00 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ; +-:-:-:-:00 IMAD offsetFc, channel, param_KRST, RZ; +// posCRST -= partial +-:-:-:-:00 IADD posCRST, posCRST, -partial; +-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut]; + +// trackF = offsetFK + offsetFC + sliceF + param_F +-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc; +-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF; +// trackI = offsetIN + offsetIC + sliceI + param_I +-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc; 
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI; + +//-:-:-:-:00 @P1 LEA trackF0.CC, offsetF, param_F[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_F[0]; +-:-:-:-:00 MOV tmp_param1, param_F[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2; +-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 @P1 IADD.X trackF1, RZ, tmp_param1; +//-:-:-:-:00 @P1 LEA trackI0.CC, offsetI, param_I[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_I[0]; +-:-:-:-:00 MOV tmp_param1, param_I[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2; +-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 @P1 IADD.X trackI1, RZ, tmp_param1; + +-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF]; +-:-:-:-:00 @!P1 LDS.128 loadF0, [RZ + addr_zero]; + +-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI]; +-:-:-:-:00 @!P1 LDS.128 loadI0, [RZ + addr_zero]; + +-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT; + +-:-:-:-:00 STS.64 [writeS], loadF0; +-:-:-:-:00 STS.64 [writeS + 4x<64>], loadF2; +-:-:-:-:00 STS.64 [writeS + 4x], loadI0; +-:-:-:-:00 STS.64 [writeS + 4x], loadI2; + +-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST; + +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 LOP.XOR writeS, writeS, 4x; + +-:-:-:-:00 LDS.64 j0Ix0, [readIs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS.64 j0Ix2, [readIs + 4x<0*128 + 64>]; +-:-:-:-:00 LDS.64 j0Fy0, [readFs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS.64 j0Fy2, [readFs + 4x<0*128 + 64>]; + +-:-:-:-:00 LDS.64 j0Ix4, [readIs + 4x<0*128 + 16>]; +-:-:-:-:00 LDS.64 j0Ix6, [readIs + 4x<0*128 + 80>]; +-:-:-:-:00 LDS.64 j0Fy4, [readFs + 4x<0*128 + 32>]; +-:-:-:-:00 LDS.64 j0Fy6, [readFs + 4x<0*128 + 96>]; + +// channel = posCRST / lutSize +-:-:-:-:00 @P1 FMUL channel, posCRSTf, lutSizeRcp; +-:-:-:-:00 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel; +-:-:-:-:00 @P1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST % lutSize) * 8 +-:-:-:-:00 @P1 IMAD tmp_param0, channel, lutSize, RZ; 
+-:-:-:-:00 @P1 IADD lutOffset, -tmp_param0, posCRST; +-:-:-:-:00 @P1 SHL lutOffset, lutOffset, 3; +// offsetIC = channel * DHWN +// offsetFC = channel * K +-:-:-:-:00 @P1 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ; +-:-:-:-:00 @P1 IMAD offsetFc, channel, param_KRST, RZ; + +-:-:-:-:00 IADD posCRST, posCRST, -8; +-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut]; + +// trackI = offsetIN + offsetIC + sliceI + param_I +// trackF = offsetFK + offsetFC + sliceF + param_F +-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc; +-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF; +-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc; +-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI; + +//-:-:-:-:00 @P1 LEA trackF0.CC, offsetF, param_F[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_F[0]; +-:-:-:-:00 MOV tmp_param1, param_F[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2; +-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 @P1 IADD.X trackF1, RZ, tmp_param1; + +//-:-:-:-:00 @P1 LEA trackI0.CC, offsetI, param_I[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_I[0]; +-:-:-:-:00 MOV tmp_param1, param_I[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2; +-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 @P1 IADD.X trackI1, RZ, tmp_param1; + +-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF]; +-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI]; + +-:-:-:-:00 MOV addressF0, param_F[0]; +-:-:-:-:00 MOV addressF1, param_F[1]; +-:-:-:-:00 MOV addressI0, param_I[0]; +-:-:-:-:00 MOV addressI1, param_I[1]; + +LOOP: + + + my %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c53 => "-:-:-:-:00 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + #warps + j0c62 => "-:-:-:-:00 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + #2 + j0c63 => "-:-:-:-:00 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + + j1c47 => "-:-:D:-:05 \@P1 FFMA channel, channel, 
5.9604644775390625e-08, channel;\n", + j1c63 => "-:-:-:-:00 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j2c47 => "-:-:-:-:00 \@P1 IMAD lutOffset, -channel, lutSize, posCRST;\n", + j2c53 => "-:-:-:-:00 \@P1 IMAD offsetF, channel, param_KRST, offsetFk;\n", + j2c61 => "-:-:-:-:00 \@P1 IMAD offsetI, channel, param_DHWN, offsetIn;\n", + j2c62 => "-:-:-:-:00 \@P1 SHL lutOffset, lutOffset, 3;\n", + j2c63 => "-:-:-:-:00 IADD posCRST, posCRST, -8;\n", + + j3c47 => "-:-:-:-:00 \@P1 LDS.64 sliceIF, [lutOffset + addr_lut];\n", + j3c53 => "-:-:-:-:00 TEXDEPBAR 0x0;\n", + j3c61 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x], loadI0;\n", + j3c62 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x], loadI2;\n", + j3c63 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<0>], loadF0;\n", + + j4c47 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<64>], loadF2;\n", + j4c53 => "-:-:-:-:00 \@P1 IADD offsetF, offsetF, sliceF;\n", + #5 + j4c61 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetF, 0x2;\n", + j4c62 => "-:-:-:-:00 \@P1 IADD offsetI, offsetI, sliceI;\n", + j4c63 => "-:-:-:-:00 \@P1 IADD trackF0.CC, tmp_shl, addressF0;\n", + + j5c47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetI, 0x2;\n", + j5c53 => "-:-:-:-:00 \@P1 IADD trackI0.CC, tmp_shl, addressI0;\n", + j5c61 => "-:-:-:-:00 \@P1 IADD.X trackF1, RZ, addressF1;\n", + j5c62 => "-:-:-:-:00 \@P1 IADD.X trackI1, RZ, addressI1;\n", + + j6c47 => "-:G:D:-:00 \@P1 LDG.E.128 loadI0, [trackI];\n", + j6c53 => "-:G:D:-:00 \@P1 LDG.E.128 loadF0, [trackF];\n", + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readIs, readIs, 4x;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readFs, readFs, 4x;\n", + j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n", + + j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x;\n", + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n", + ); + + my @cOrder; + + push @cOrder, [0,0]; + push @cOrder, [0,1]; + push @cOrder, [1,1]; + push @cOrder, [2,0]; + push @cOrder, [1,0]; + push @cOrder, [2,1]; + push @cOrder, [2,3]; + push @cOrder, [2,2]; + push @cOrder, [1,2]; + push @cOrder, [0,3]; + push 
@cOrder, [1,3]; + push @cOrder, [0,2]; + push @cOrder, [0,4]; + push @cOrder, [0,5]; + push @cOrder, [1,5]; + push @cOrder, [2,4]; + push @cOrder, [1,4]; + push @cOrder, [2,5]; + push @cOrder, [2,7]; + push @cOrder, [2,6]; + push @cOrder, [1,6]; + push @cOrder, [0,7]; + push @cOrder, [1,7]; + push @cOrder, [0,6]; + push @cOrder, [3,6]; + push @cOrder, [3,7]; + push @cOrder, [4,7]; + push @cOrder, [5,6]; + push @cOrder, [4,6]; + push @cOrder, [5,7]; + push @cOrder, [5,5]; + push @cOrder, [5,4]; + push @cOrder, [4,4]; + push @cOrder, [3,5]; + push @cOrder, [4,5]; + push @cOrder, [3,4]; + push @cOrder, [3,2]; + push @cOrder, [3,3]; + push @cOrder, [4,3]; + push @cOrder, [5,2]; + push @cOrder, [4,2]; + push @cOrder, [5,3]; + push @cOrder, [5,1]; + push @cOrder, [5,0]; + push @cOrder, [4,0]; + push @cOrder, [3,1]; + push @cOrder, [4,1]; + push @cOrder, [3,0]; + push @cOrder, [6,0]; + push @cOrder, [7,0]; + push @cOrder, [7,1]; + push @cOrder, [6,2]; + push @cOrder, [6,1]; + push @cOrder, [7,2]; + push @cOrder, [7,5]; + push @cOrder, [6,5]; + push @cOrder, [6,4]; + push @cOrder, [7,3]; + push @cOrder, [7,4]; + push @cOrder, [6,3]; + push @cOrder, [6,6]; + push @cOrder, [6,7]; + push @cOrder, [7,7]; + push @cOrder, [7,6]; + + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? 
'@P0' : ' '; + + $insert{"j${j}c5"} = sprintf "-:G:D:-:00 %s LDS.64 j%dIx0, [readIs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c11"} = sprintf "-:G:D:-:00 %s LDS.64 j%dIx2, [readIs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c17"} = sprintf "-:G:D:-:00 %s LDS.64 j%dIx4, [readIs + 4x<%d*128 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c59"} = sprintf "-:G:D:-:00 %s LDS.64 j%dIx6, [readIs + 4x<%d*128 + 80>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c23"} = sprintf "-:G:D:-:00 %s LDS.64 j%dFy0, [readFs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c29"} = sprintf "-:G:D:-:00 %s LDS.64 j%dFy2, [readFs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c35"} = sprintf "-:G:D:-:00 %s LDS.64 j%dFy4, [readFs + 4x<%d*128 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c41"} = sprintf "-:G:D:-:00 %s LDS.64 j%dFy6, [readFs + 4x<%d*128 + 96>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $ctrl = "-:-:-:-:00"; + + if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) { + $ins = "-:G:D:-:00 NOP;\n"; + } + + if ($c > 60 && !$ins){ + $ins = "-:-:D:-:07 NOP;\n"; + } + + # 04 and 05 are dual issued + if($ins) { + $ctrl = "-:-:D:-:04"; + } else { + if(($c - 1) % 6 == 0 || ($c - 3) % 6 == 0){ + $ctrl = "-:-:D:-:04"; + } + else{ + $ctrl = "-:-:D:-:05"; + } + } + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +-:-:-:-:00 LDS.128 mpq, [RZ + addr_m]; +-:-:-:-:00 S2R tid, SR_TID.X; // 0-127 +-:-:-:-:00 S2R blkI, SR_CTAID.Z; // N id +-:-:-:-:00 S2R blkF, SR_CTAID.Y; // K id + +// tidOX = (tid & 7) << 2 + (tid & 128) >> 1 +// tidOY = (tid & 127) >> 3 +-:-:-:-:00 LOP.AND tidOX, tid, 7; +-:-:-:-:00 SHL tidOX, tidOX, 2; +-:-:-:-:00 LOP.AND tidOX2, tid, 128; +-:-:-:-:00 SHR.U32 tidOX2, tidOX2, 1; +-:-:-:-:00 LOP.OR tidOX, tidOX, tidOX2; +-:-:-:-:00 LOP.AND tidOY, tid, 127; +-:-:-:-:00 SHR.U32 tidOY, tidOY, 3; + +-:-:-:-:00 SHL readFs, readFs, 1; +-:-:-:-:00 SHL readIs, readIs, 1; +-:-:-:-:00 LOP.AND readIs, readIs, 0x1ff; +-:-:-:-:00 LOP.AND readFs, readFs, 0x0ff; + +// Div by 4 here collapses k stride +// writeCs = readFs * 16 + readIs; +-:-:-:-:00 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +-:-:-:-:00 ISCADD readCs, tidOY, tidOX, 7; +-:-:-:-:00 SHL readCs, readCs, 2; + +// n = blkI * 128 + tidOX; +-:-:-:-:00 ISCADD n, blkI, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF * 128 + tidOY * 4 +-:-:-:-:00 SHL tidOY, tidOY, 2; +-:-:-:-:00 ISCADD k, blkF, tidOY, 7; + +// o = k*MPQN + m*PQN + p*QN + q*N + n +-:-:-:-:00 IMAD to, q, param_N, n; +-:-:-:-:00 IMAD.U32.U32 to, p, param_QN, to; +-:-:-:-:00 IMAD.U32.U32 to, m, param_PQN, to; +-:-:-:-:00 IMAD.U32.U32 to, k, param_MPQN, to; +-:-:-:-:00 MOV tmp_param0, param_O[0]; +-:-:-:-:00 MOV 
tmp_param1, param_O[1]; +-:-:-:-:00 SHL tmp_shl, to, 0x2; +-:-:-:-:00 IADD Out0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X Out1, RZ, tmp_param1; + +-:-:-:-:00 MOV MPQN, param_MPQN; +-:-:-:-:00 SHL MPQN1, MPQN, 2; +-:-:-:-:00 SHL MPQN4, MPQN, 4; +-:-:-:-:00 ISCADD MPQN60, MPQN, -MPQN4, 8; + +-:-:-:-:00 ISETP.LT.AND P0, PT, n, param_N, PT; // n + 0 < N +-:-:-:-:00 IADD n, n, 32; +-:-:-:-:00 ISETP.LT.AND P1, PT, n, param_N, PT; // n + 32 < N + +-:-:-:-:00 MOV alpha, param_alpha; + +-:-:-:-:00 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + if ($y == 4) + { + $out .= sprintf( + "-:-:-:-:00 IADD Out0.CC, Out0, MPQN60;\n" . + "-:-:-:-:00 IADD k, k, 60;\n" . + "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n" . + "-:-:-:-:00 IADD.X Out1, Out1, RZ;\n\n", + ($y) x 8); + } + else + { + $out .= sprintf( + "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" . 
+ "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n\n", + ($y) x 8); + } + + $out .= "-:-:-:-:00 CAL STORE_C;\n\n"; + } + return $out; + + + +-:-:-:-:00 EXIT; + +STORE_C: + +-:-:-:-:00 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n + 0 < N +-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n + 32 < N +-:-:-:-:00 IADD k, k, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +-:-:-:-:00 STS.128 [writeCs + 4x<00>], cs0; +-:-:-:-:00 STS.128 [writeCs + 4x<32>], cs4; +-:-:-:-:00 LDS.128 cs0, [readCs + 4x<00>]; +-:-:-:-:00 LDS.128 cs4, [readCs + 4x<32>]; + +// Store results back to global +-:-:-:-:00 @P2 ST.E.128 [Out + 4x<00>], cs0; +-:-:-:-:00 @P3 ST.E.128 [Out + 4x<32>], cs4; + +-:-:-:-:00 IADD Out0.CC, Out0, MPQN1; +-:-:-:-:00 IADD.X Out1, Out1, RZ; + +-:-:-:-:00 RET; + diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.cu b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.cu new file mode 100644 index 0000000..ebfa963 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.cu @@ -0,0 +1,48 @@ +extern "C" +__global__ void sconv_fprop_K64_N64 ( + float* param_test, + float *param_O, + const float *param_I, + const float *param_F, + float param_alpha, + int param_N, + int param_K, + int param_D, + int param_H, + int param_W, + int param_WN, + int param_HWN, + int param_DHWN, + int param_C, + int param_KRST, + int param_RST, + int param_RS, + int param_magic_RS, + int param_shift_RS, + int param_S, + int param_magic_S, + int param_shift_S, + int param_pad_d, + int param_pad_h, + int param_pad_w, + int param_str_d, + int param_str_h, + int param_str_w, + int param_Q, + int param_PQ, + int param_QN, + int param_PQN, + int param_MPQN, + int param_magic_Q, + int param_shift_Q, + int param_magic_PQ, + int param_shift_PQ) { + __shared__ float share[64 * 8 * 4 + 8]; + + int tid = threadIdx.x; + + share[tid] = 1; + + *param_O = share[63-tid]; + *param_test = share[63-tid]; + } diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.sass 
b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.sass new file mode 100644 index 0000000..8db0438 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.sass @@ -0,0 +1,782 @@ +# Kernel: sconv_fprop_K64_N64 +// debug: +// mode1 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +//-:-:-:-:00 SHL tmp_shl, tid, 0x2; +//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0; +//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +//-:-:-:-:00 I2F.F32.U32 rst, rst; +//-:-:-:-:00 ST.E [tmp_param00], rst; +//-:-:-:-:00 EXIT; + +// mode2 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +// +//-:-:-:-:00 MOV32I k, 0x40000000; +//-:-:-:-:00 ST.E [tmp_param0], k; +//-:-:-:-:00 EXIT; + +// modify steps: +// XMAD->IMAD +// shared memory addresses->RZ +// LDG->LD +// LEA->MOV, IADD, SHL +// XMAD.LO2C->IMAD.U32.U32 +// XMAD.PSL->IMAD.U32.U32 +// VMAD->IMAD, IADD +// MOV->MOV32I +// IADD3->IADD, IADD +// POPC +// ST.CG->ST +// control code +// comments +// LDS.U->LDS +// register<0-7>->register<0-3>, register<4-7> +// avoid register conflicts + +// optimization steps: +// alexnet2 +// initial->1200 +// bank conflict->1288 +// alignment+dual issue+reuse->1600 +// half ldg.128->1700 +// all ldg.128->1777 +// control codes->1900 +// scheduling->1937 +// reduce unnecessary instructions->2100 + + + addr_zero : 4x<64*8*4 + 0> + addr_m : 4x<64*8*4 + 4> + addr_p : 4x<64*8*4 + 5> + addr_q : 4x<64*8*4 + 6> + addr_szLut : 4x<64*8*4 + 7> + addr_lut : 4x<64*8*4 + 8> + + param_test[0] : c[0x0][0x140] + param_test[1] : c[0x0][0x144] + param_O[0] : c[0x0][0x148] + param_O[1] : c[0x0][0x14c] + param_I[0] : c[0x0][0x150] + param_I[1] : c[0x0][0x154] + param_F[0] : c[0x0][0x158] + param_F[1] : c[0x0][0x15c] + param_alpha : c[0x0][0x160] + param_N : c[0x0][0x164] + param_K : c[0x0][0x168] + param_D : c[0x0][0x16c] + param_H : c[0x0][0x170] + param_W : c[0x0][0x174] + param_WN : c[0x0][0x178] + param_HWN : c[0x0][0x17c] 
+ param_DHWN : c[0x0][0x180] + param_C : c[0x0][0x184] + param_KRST : c[0x0][0x188] + param_RST : c[0x0][0x18c] + param_RS : c[0x0][0x190] + param_magic_RS : c[0x0][0x194] + param_shift_RS : c[0x0][0x198] + param_S : c[0x0][0x19c] + param_magic_S : c[0x0][0x1a0] + param_shift_S : c[0x0][0x1a4] + param_pad_d : c[0x0][0x1a8] + param_pad_h : c[0x0][0x1ac] + param_pad_w : c[0x0][0x1b0] + param_str_d : c[0x0][0x1b4] + param_str_h : c[0x0][0x1b8] + param_str_w : c[0x0][0x1bc] + param_Q : c[0x0][0x1c0] + param_PQ : c[0x0][0x1c4] + param_QN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_MPQN : c[0x0][0x1d0] + param_magic_Q : c[0x0][0x1d4] + param_shift_Q : c[0x0][0x1d8] + param_magic_PQ : c[0x0][0x1dc] + param_shift_PQ : c[0x0][0x1e0] + + + + + 64-67 : mpq<0-3> + 64-67 : m, p, q, tidY + 68-72 : blkF, blkI, blkMPQ, tid1, tidX + 73-95 ~ pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 : j0Fy<0-3> + 68-71 : j0Ix<0-3> + 72-75 : j0Fy<4-7> + 76-79 : j0Ix<4-7> + 80-83 : j1Fy<0-3> + 84-87 : j1Ix<0-3> + 88-91 : j1Fy<4-7> + 92-95 : j1Ix<4-7> + + 96-97 : trackI<0-1> + 98-99 : trackF<0-1> + + 100-103 : loadI<0-3> + 104-107 : loadF<0-3> + 108-111 : loadI<4-7> + 112-115 : loadF<4-7> + + 117 : readFs + 116 : readIs + 118-127 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI + + 128 : tmp_shl + 129 : tmp_data + + 130-131 : tmp_param<0-1> + 132 : p_and + 133 : tid + 134-135 : sliceI, sliceF + 134-135 : sliceIF<0-1> + 136-139 ~ offsetF, offsetIc, 
offsetFc + 140-143 ~ addressF0, addressF1, addressI0, addressI1 + 144-145 : tmp_param0<0-1> + + 72-79 : cs<0-7> + 80-81 : Out<0-1> + + 82-125 ~ writeCs, readCs, alpha, tidOX, tidOY, to, k, n, MPQN1, MPQN28, MPQN, MPQN4 + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkF, SR_CTAID.Y; #K64 +-:-:-:-:00 S2R blkI, SR_CTAID.Z; #N64 +-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; # m,p,q stored in x index + +// if tid >= 32 +// P0 = true +-:-:-:-:00 ISETP.GE.AND P0, PT, tid, 32, PT; + +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + return join '', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15; + + +// tid <= 63 +// tidX = (tid & 7) << 2 +// tidX = 0 : 4 : 28 +// tidY = tid >> 3 +// tidY = 0 : 1 : 7 +-:-:-:-:00 LOP.AND tidX, tid, 7; +-:-:-:-:00 SHL tidX, tidX, 2; +-:-:-:-:00 SHR.U32 tidY, tid, 3; + +// offsetFk += blkF * 64 + tidX +// K64 +// blkF ---- trackF +// tidX +-:-:-:-:00 ISCADD offsetFk, blkF, tidX, 6; + +// offsetIn += blkI * 64 + tidX +// N64 +// blkI ---- trackI +// tidX +-:-:-:-:00 ISCADD offsetIn, blkI, tidX, 6; + +// writeS = (64 * tidY + tidX) * 4 +// tidY = 0 : 1 : 7 +// tidX = 0 : 4 : 28 +// ---------------- +// ---------------- tidY 0 : 1 : 8 +// ---- writeS +// tidX +-:-:-:-:00 ISCADD writeS, tidY, tidX, 6; +-:-:-:-:00 SHL writeS, writeS, 2; + +// readFs = (((tid & 0x30) >> 3) | (tid & 1)) << 4; +// [0][5][4] * 16; +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 LOP.AND readFs, tid, 0x30; +-:-:-:-:00 SHR.U32 readFs, readFs, 3; +-:-:-:-:00 LOP.OR readFs, readFs, tid1; +-:-:-:-:00 SHL readFs, readFs, 4; + +// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +// [3][2][1] * 16 + 512 * 4; +-:-:-:-:00 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 ISCADD readIs, readIs, 4x<8*64>, 4; +-:-:-:-:00 MOV32I tmp_data, 128; + +-:-:-:-:00 @P0 BRA.U END_SETUP; + +-:-:-:-:00 MOV rst, tid; +-:-:-:-:00 MOV lutStore2, RZ; +-:-:-:-:00 MOV lutSize, RZ; +-:-:-:-:00 MOV32I warp_count, 32; + +// m = blkMPQ / PQ +// pq = blkMPQ % PQ 
+-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ; +-:-:-:-:00 SHR.U32 m, m, param_shift_PQ; +-:-:-:-:00 IMAD pq, m, param_PQ, RZ; +-:-:-:-:00 IADD pq, -pq, blkMPQ; +// p = pq / Q +// q = pq % Q +-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ; +-:-:-:-:00 SHR.U32 p, p, param_shift_Q; +-:-:-:-:00 IMAD q, p, param_Q, RZ; +-:-:-:-:00 IADD q, -q, pq; + +// dep_thd_mask = -1 +-:-:-:-:00 MOV32I dep_thd_mask, -1; + +// if p is odd +// set q = param_Q - q - 1 +// if p is even +// q = q +-:-:-:-:00 LOP.AND p_and, p, 1; +-:-:-:-:00 ISETP.NE.AND P1, PT, p_and, RZ, PT; +-:-:-:-:00 @P1 IADD q, -q, param_Q; +-:-:-:-:00 @P1 IADD q, q, dep_thd_mask; + +-:-:-:-:00 STS.128 [RZ + addr_m], m; + +// mt = m * w - pad_d +// pr = p * u - pad_h +// qs = q * v - pad_w +-:-:-:-:00 IMAD qs, q, param_str_w, RZ; +-:-:-:-:00 IMAD pr, p, param_str_h, RZ; +-:-:-:-:00 IMAD mt, m, param_str_d, RZ; +-:-:-:-:00 IADD qs, qs, -param_pad_w; +-:-:-:-:00 IADD pr, pr, -param_pad_h; +-:-:-:-:00 IADD mt, mt, -param_pad_d; + +// mask_shr = 32 - tid +// dep_thd_mask = dep_thd_mask >> mask_shr +-:-:-:-:00 IADD mask_shr, -tid, 32; +-:-:-:-:00 SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr; + +LUT_LOOP: + +// warp synchronous loop while warp_count < RST +-:-:-:-:00 ISETP.LT.AND P0, PT, warp_count, param_RST, PT; +-:-:-:-:00 IADD warp_count, warp_count, 32; +// t = rst / RS +// rs = rst % RS +-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ; +-:-:-:-:00 SHR.U32 t, t, param_shift_RS; +-:-:-:-:00 IMAD rs, t, param_RS, RZ; +-:-:-:-:00 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ; +-:-:-:-:00 SHR.U32 r, r, param_shift_S; +-:-:-:-:00 IMAD s, r, param_S, RZ; +-:-:-:-:00 IADD s, -s, rs; +// x = qs + s +// y = pr + r +// z = mt + t +-:-:-:-:00 IADD x, qs, s; +-:-:-:-:00 IADD y, pr, r; +-:-:-:-:00 IADD z, mt, t; +-:-:-:-:00 ISETP.GE.AND P4, PT, x, RZ, PT; +-:-:-:-:00 ISETP.GE.AND P5, PT, y, RZ, PT; +-:-:-:-:00 ISETP.GE.AND P6, PT, z, RZ, PT; +-:-:-:-:00 
ISETP.LT.AND P4, PT, x, param_W, P4; +-:-:-:-:00 ISETP.LT.AND P5, PT, y, param_H, P5; +-:-:-:-:00 ISETP.LT.AND P6, PT, z, param_D, P6; +-:-:-:-:00 PSETP.AND.AND P1, PT, P4, P5, P6; + +// sliceI = z*HWN + y*WN + x*N +// rst N +// -------------- -------------- +// -------------- -------------- +// -------------- -------------- +// -------------- K * rst -------------- +// -------------- -------------- +// -------------- -------------- +-:-:-:-:00 IMAD sliceI, x, param_N, RZ; +-:-:-:-:00 IMAD.U32.U32 sliceI, y, param_WN, sliceI; +-:-:-:-:00 IMAD.U32.U32 sliceI, z, param_HWN, sliceI; + +// sliceF = rst * K +-:-:-:-:00 IMAD sliceF, rst, param_K, RZ; + +// Get a mask of all valid slices in the warp +-:-:-:-:00 VOTE.ANY ballot, PT, P1; +// Count the total valid slices +-:-:-:-:00 POPC warp_slices, ballot, ballot; +// Prepare lutStore for this and next loop +// lutStore = lutStore2 +// lutStore2 = warp_slices * 8 + lutStore2 +-:-:-:-:00 @P1 MOV lutStore, lutStore2; +-:-:-:-:00 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +// bit(dep_thd_mask) = tid +// bit(ballot) = valid tid +// dep_thd_cnt = number of bit below ballot +-:-:-:-:00 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot; +-:-:-:-:00 @P1 POPC dep_thd_cnt, dep_thd_bits, dep_thd_bits; +// use the rst increment to space the barrier sync +// rst = rst + 32 +-:-:-:-:00 IADD rst, rst, 32; +// Update the lutStore address from this count +// lutStore = dep_thd_cnt * 8 + lutStore +-:-:-:-:00 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +-:-:-:-:00 @P1 STS.64 [lutStore + addr_lut], sliceIF; + +// Keep track of the total size of the lut +// lutSize = lutSize + warp_slices +-:-:-:-:00 IADD lutSize, lutSize, warp_slices; + +-:-:-:-:00 @P0 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +-:-:-:-:00 STS [RZ + addr_szLut], lutSize; + +// if tid >= 32, directly enter it +END_SETUP: + +-:-:-:-:00 BAR.SYNC 0; + +// Grab the 
calculated lut size and get its reciprocal +// Get the total reduction depth +-:-:-:-:00 LDS lutSize, [RZ + addr_szLut]; + +// endCRST = lutSize * param_C (channel) +-:-:-:-:00 IMAD endCRST, lutSize, param_C, RZ; +// lutSizeRcp = 1 / lutSize +-:-:-:-:00 I2F.F32.S32 lutSizeRcp, lutSize; +-:-:-:-:00 MUFU.RCP lutSizeRcp, lutSizeRcp; + +// posCRST = endCRST - tidY - 1 +-:-:-:-:00 IADD posCRST, endCRST, -1; +-:-:-:-:00 IADD posCRST, posCRST, -tidY; + +// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch. +// If it is a multiple of 8 then make a full 8 line fetch. +-:-:-:-:00 LOP.AND partial, endCRST, 7; +-:-:-:-:00 ISETP.EQ.AND P1, PT, RZ, partial, PT; +// If partial == 0 +-:-:-:-:00 @P1 MOV32I partial, 8; +// channel = lower(posCRST / lutSize) +// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it +-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST; +-:-:-:-:00 FMUL channel, posCRSTf, lutSizeRcp; +-:-:-:-:00 FFMA channel, channel, 5.9604644775390625e-08, channel; +-:-:-:-:00 F2I.S32.F32.TRUNC channel, channel; + +// lutOffset = (posCRST % lutSize) * 8 +-:-:-:-:00 IMAD tmp_param0, channel, lutSize, RZ; +-:-:-:-:00 IADD lutOffset, -tmp_param0, posCRST; +-:-:-:-:00 SHL lutOffset, lutOffset, 3; +// P1 = tidY < partial +-:-:-:-:00 ISETP.LT.AND P1, PT, tidY, partial, PT; + +// offsetIC = channel * DHWN +// offsetFC = channel * KRST +-:-:-:-:00 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ; +-:-:-:-:00 IMAD offsetFc, channel, param_KRST, RZ; +// posCRST -= partial +-:-:-:-:00 IADD posCRST, posCRST, -partial; +-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut]; + +// trackF = offsetFK + offsetFC + sliceF + param_F +-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc; +-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF; +// trackI = offsetIN + offsetIC + sliceI + param_I +-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc; +-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI; + +//-:-:-:-:00 @P1 LEA trackF0.CC, offsetF, 
param_F[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2; +-:-:-:-:00 MOV addressF0, param_F[0]; +-:-:-:-:00 MOV addressF1, param_F[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2; +-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, addressF0; +-:-:-:-:00 @P1 IADD.X trackF1, RZ, addressF1; +//-:-:-:-:00 @P1 LEA trackI0.CC, offsetI, param_I[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2; +-:-:-:-:00 MOV addressI0, param_I[0]; +-:-:-:-:00 MOV addressI1, param_I[1]; +-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2; +-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, addressI0; +-:-:-:-:00 @P1 IADD.X trackI1, RZ, addressI1; + +-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF + 4x<0>]; +-:-:-:-:00 @P1 LD.E.128 loadF4, [trackF + 4x<32>]; +-:-:-:-:00 @!P1 LDS.128 loadF0, [RZ + addr_zero]; +-:-:-:-:00 @!P1 LDS.128 loadF4, [RZ + addr_zero]; + +-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI + 4x<0>]; +-:-:-:-:00 @P1 LD.E.128 loadI4, [trackI + 4x<32>]; +-:-:-:-:00 @!P1 LDS.128 loadI0, [RZ + addr_zero]; +-:-:-:-:00 @!P1 LDS.128 loadI4, [RZ + addr_zero]; + +-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT; + +-:-:-:-:00 STS.128 [writeS + 4x<0*64 + 0>], loadF0; +-:-:-:-:00 STS.128 [writeS + 4x<0*64 + 32>], loadF4; + +-:-:-:-:00 STS.128 [writeS + 4x<8*64 + 0>], loadI0; +-:-:-:-:00 STS.128 [writeS + 4x<8*64 + 32>], loadI4; + +-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST; + +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 LOP.XOR writeS, writeS, 4x<64*8*2>; + +-:-:-:-:00 LDS.128 j0Ix0, [readIs + 4x<0*64 + 00>]; +-:-:-:-:00 LDS.128 j0Fy0, [readFs + 4x<0*64 + 00>]; + +-:-:-:-:00 LDS.128 j0Ix4, [readIs + 4x<0*64 + 32>]; +-:-:-:-:00 LDS.128 j0Fy4, [readFs + 4x<0*64 + 32>]; + +// channel = posCRST / lutSize +-:-:-:-:00 @P1 FMUL channel, posCRSTf, lutSizeRcp; +-:-:-:-:00 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel; +-:-:-:-:00 @P1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST % lutSize) * 8 +-:-:-:-:00 @P1 IMAD tmp_param0, channel, lutSize, RZ; +-:-:-:-:00 @P1 IADD lutOffset, 
-tmp_param0, posCRST; +-:-:-:-:00 @P1 SHL lutOffset, lutOffset, 3; +// offsetIC = channel * DHWN +// offsetFC = channel * K +-:-:-:-:00 @P1 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ; +-:-:-:-:00 @P1 IMAD offsetFc, channel, param_KRST, RZ; + +-:-:-:-:00 IADD posCRST, posCRST, -8; +-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut]; + +// trackI = offsetIN + offsetIC + sliceI + param_I +// trackF = offsetFK + offsetFC + sliceF + param_F +-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc; +-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF; +-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc; +-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI; + +//-:-:-:-:00 @P1 LEA trackF0.CC, offsetF, param_F[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2; +-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2; +-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, addressF0; +-:-:-:-:00 @P1 IADD.X trackF1, RZ, addressF1; + +//-:-:-:-:00 @P1 LEA trackI0.CC, offsetI, param_I[0], 2; +//-:-:-:-:00 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2; +-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2; +-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, addressI0; +-:-:-:-:00 @P1 IADD.X trackI1, RZ, addressI1; + +-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF + 4x< 0>]; +-:-:-:-:00 @P1 LD.E.128 loadF4, [trackF + 4x<32>]; +-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI + 4x< 0>]; +-:-:-:-:00 @P1 LD.E.128 loadI4, [trackI + 4x<32>]; + +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + +LOOP: + + + my %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c53 => "-:-:-:-:00 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + j0c61 => "-:-:-:-:00 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c62 => "-:-:-:-:00 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j1c47 => "-:-:-:-:00 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j1c53 => "-:-:-:-:00 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + j1c61 => "-:-:-:-:00 \@P1 IMAD lutOffset, -channel, lutSize, posCRST;\n", + j1c62 => "-:-:-:-:00 \@P1 IMAD offsetF, channel, 
param_KRST, offsetFk;\n", + j1c63 => "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 \@P1 SHL lutOffset, lutOffset, 3;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 IADD posCRST, posCRST, -8;\n". + "-:G:D:-:00 \@P1 LDS.64 sliceIF, [lutOffset + addr_lut];\n", + + j2c47 => "-:-:-:-:00 \@P1 IMAD.U32.U32 offsetI, channel, param_DHWN, offsetIn;\n", + j2c53 => "-:-:-:-:00 TEXDEPBAR 0x0;\n", + j2c61 => "-:-:-:-:00 \@P1 IADD offsetF, offsetF, sliceF;\n", + j2c62 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*64 + 0>], loadI0;\n", + j2c63 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*64 + 32>], loadI4;\n", + + j3c47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetF, 0x2;\n", + j3c53 => "-:-:-:-:00 \@P1 IADD offsetI, offsetI, sliceI;\n", + j3c61 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*64 + 0>], loadF0;\n", + j3c62 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*64 + 32>], loadF4;\n", + j3c63 => "-:-:-:-:00 \@P1 IADD trackF0.CC, tmp_shl, addressF0;\n", + + j4c47 => "-:-:-:-:00 \@P1 IADD.X trackF1, RZ, addressF1;\n", + j4c53 => "-:-:-:-:00 \@P1 IADD tmp_param0.CC, tmp_data, trackF0;\n", + j4c61 => "-:-:-:-:00 \@P1 IADD.X tmp_param1, RZ, trackF1;\n", + + j5c47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetI, 0x2;\n", + + j5c61 => "-:-:-:-:00 \@P1 IADD trackI0.CC, tmp_shl, addressI0;\n", + j5c62 => "-:G:D:-:00 \@P1 LDG.E.128 loadF0, [trackF + 4x< 0>];\n", + j5c63 => "-:G:D:-:00 \@P1 LDG.E.128 loadF4, [tmp_param];\n", + + j6c47 => "-:-:-:-:00 \@P1 IADD.X trackI1, RZ, addressI1;\n", + j6c53 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n", + + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n", + j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n", + + j7c47 => "-:-:-:-:00 \@P1 IADD tmp_param0.CC, tmp_data, trackI0;\n", + j7c53 => "-:-:-:-:00 \@P1 IADD.X tmp_param1, RZ, trackI1;\n", + j7c61 => "-:G:D:-:00 \@P1 LDG.E.128 loadI0, [trackI];\n", + j7c62 => "-:G:D:-:00 \@P1 
LDG.E.128 loadI4, [tmp_param];\n", + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n", + ); + + my @cOrder; + + push @cOrder, [0,0]; + push @cOrder, [0,1]; + push @cOrder, [1,1]; + push @cOrder, [2,0]; + push @cOrder, [1,0]; + push @cOrder, [2,1]; + push @cOrder, [2,3]; + push @cOrder, [2,2]; + push @cOrder, [1,2]; + push @cOrder, [0,3]; + push @cOrder, [1,3]; + push @cOrder, [0,2]; + push @cOrder, [0,4]; + push @cOrder, [0,5]; + push @cOrder, [1,5]; + push @cOrder, [2,4]; + push @cOrder, [1,4]; + push @cOrder, [2,5]; + push @cOrder, [2,7]; + push @cOrder, [2,6]; + push @cOrder, [1,6]; + push @cOrder, [0,7]; + push @cOrder, [1,7]; + push @cOrder, [0,6]; + push @cOrder, [3,6]; + push @cOrder, [3,7]; + push @cOrder, [4,7]; + push @cOrder, [5,6]; + push @cOrder, [4,6]; + push @cOrder, [5,7]; + push @cOrder, [5,5]; + push @cOrder, [5,4]; + push @cOrder, [4,4]; + push @cOrder, [3,5]; + push @cOrder, [4,5]; + push @cOrder, [3,4]; + push @cOrder, [3,2]; + push @cOrder, [3,3]; + push @cOrder, [4,3]; + push @cOrder, [5,2]; + push @cOrder, [4,2]; + push @cOrder, [5,3]; + push @cOrder, [5,1]; + push @cOrder, [5,0]; + push @cOrder, [4,0]; + push @cOrder, [3,1]; + push @cOrder, [4,1]; + push @cOrder, [3,0]; + push @cOrder, [6,0]; + push @cOrder, [7,0]; + push @cOrder, [7,1]; + push @cOrder, [6,2]; + push @cOrder, [6,1]; + push @cOrder, [7,2]; + push @cOrder, [7,5]; + push @cOrder, [6,5]; + push @cOrder, [6,4]; + push @cOrder, [7,3]; + push @cOrder, [7,4]; + push @cOrder, [6,3]; + push @cOrder, [6,6]; + push @cOrder, [6,7]; + push @cOrder, [7,7]; + push @cOrder, [7,6]; + + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? 
'@P0' : ' '; + + $insert{"j${j}c5"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx0, [readIs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx2, [readIs + 4x<%d*64 + 2>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx4, [readIs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dIx6, [readIs + 4x<%d*64 + 34>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy0, [readFs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c29"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy2, [readFs + 4x<%d*64 + 2>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy4, [readFs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy6, [readFs + 4x<%d*64 + 34>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $ctrl = "-:-:-:-:00"; + + if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) { + $ins = "-:G:D:-:00 NOP;\n"; + } + + if ($c > 60 && !$ins){ + $ins = "-:-:D:-:07 NOP;\n"; + } + + # 04 and 05 are dual issued + if($ins) { + $ctrl = "-:-:D:-:04"; + } else { + if(($c - 1) % 6 == 0 || ($c - 3) % 6 == 0){ + $ctrl = "-:-:D:-:04"; + } + else{ + $ctrl = "-:-:D:-:05"; + } + } + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +-:-:-:-:00 LDS.128 mpq, [RZ + addr_m]; +-:-:-:-:00 S2R tid, SR_TID.X; // 0-64 +-:-:-:-:00 S2R blkI, SR_CTAID.Z; // N id +-:-:-:-:00 S2R blkF, SR_CTAID.Y; // K id + +// tidOX = (tid & 7) << 2 +// tidOY = tid >> 3 +-:-:-:-:00 LOP.AND tidOX, tid, 7; +-:-:-:-:00 SHL tidOX, tidOX, 2; +-:-:-:-:00 SHR.U32 tidOY, tid, 3; + +-:-:-:-:00 LOP.AND readIs, readIs, 0x7ff; +-:-:-:-:00 LOP.AND readFs, readFs, 0x7ff; + 
+// Div by 4 here collapses k stride +// writeCs = readFs * 16 + readIs; +-:-:-:-:00 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +-:-:-:-:00 ISCADD readCs, tidOY, tidOX, 6; +-:-:-:-:00 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +-:-:-:-:00 ISCADD n, blkI, tidOX, 6; + +// Mul by 4 here expands k stride back out +// k = blkF*64 + tidOY * 4 +-:-:-:-:00 SHL tidOY, tidOY, 2; +-:-:-:-:00 ISCADD k, blkF, tidOY, 6; + +// o = k*MPQN + m*PQN + p*QN + q*N + n +-:-:-:-:00 IMAD to, q, param_N, n; +-:-:-:-:00 IMAD.U32.U32 to, p, param_QN, to; +-:-:-:-:00 IMAD.U32.U32 to, m, param_PQN, to; +-:-:-:-:00 IMAD.U32.U32 to, k, param_MPQN, to; +-:-:-:-:00 MOV tmp_param0, param_O[0]; +-:-:-:-:00 MOV tmp_param1, param_O[1]; +-:-:-:-:00 SHL tmp_shl, to, 0x2; +-:-:-:-:00 IADD Out0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X Out1, RZ, tmp_param1; + +-:-:-:-:00 MOV MPQN, param_MPQN; +-:-:-:-:00 SHL MPQN1, MPQN, 2; +-:-:-:-:00 SHL MPQN4, MPQN, 4; +-:-:-:-:00 ISCADD MPQN28, MPQN, -MPQN4, 7; + +-:-:-:-:00 ISETP.LT.AND P0, PT, n, param_N, PT; // n + 0 < N +-:-:-:-:00 IADD n, n, 32; +-:-:-:-:00 ISETP.LT.AND P1, PT, n, param_N, PT; // n + 32 < N + +-:-:-:-:00 MOV alpha, param_alpha; + +-:-:-:-:00 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + if ($y == 4) + { + $out .= sprintf( + "-:-:-:-:00 IADD Out0.CC, Out0, MPQN28;\n" . + "-:-:-:-:00 IADD k, k, 28;\n" . + "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n" . + "-:-:-:-:00 IADD.X Out1, Out1, RZ;\n\n", + ($y) x 8); + } + else + { + $out .= sprintf( + "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" . 
+ "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" . + "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n\n", + ($y) x 8); + } + + $out .= "-:-:-:-:00 CAL STORE_C;\n\n"; + } + return $out; + + + +-:-:-:-:00 EXIT; + +STORE_C: + +-:-:-:-:00 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n + 0 < N +-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n + 32 < N +-:-:-:-:00 IADD k, k, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +-:-:-:-:00 STS.128 [writeCs + 4x<00>], cs0; +-:-:-:-:00 STS.128 [writeCs + 4x<32>], cs4; +-:-:-:-:00 LDS.128 cs0, [readCs + 4x<00>]; +-:-:-:-:00 LDS.128 cs4, [readCs + 4x<32>]; + +// Store results back to global +-:-:-:-:00 @P2 ST.E.128 [Out + 4x<00>], cs0; +-:-:-:-:00 @P3 ST.E.128 [Out + 4x<32>], cs4; + +-:-:-:-:00 IADD Out0.CC, Out0, MPQN1; +-:-:-:-:00 IADD.X Out1, Out1, RZ; + +-:-:-:-:00 RET; + diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin new file mode 100644 index 0000000..999e19e Binary files /dev/null and b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin differ diff --git a/Kernel/Convolution/Kepler/sconv_update.cu b/Kernel/Convolution/Kepler/sconv_update.cu new file mode 100644 index 0000000..ab88d60 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_update.cu @@ -0,0 +1,162 @@ +#include "sconv.h" + +// Host-side driver for the sconv_update_C128_K128 kernel. +// Derives the flattened strides (WN, HWN, DHWN, RS, RST, CRST, PQ, QN, PQN, MPQN) and +// the magic32() constants from the tensor dimensions, launches the kernel with +// F, I and O bound to param_F, param_I and param_E, then prints the first 256 +// floats of the debug buffer (param_test) to stdout. +// Returns false when the debug buffer cannot be allocated or the launch fails. +bool update(const float *I, float *F, const float *O, + unsigned int N, unsigned int C, unsigned int K, + unsigned int D, unsigned int H, unsigned int W, + unsigned int R, unsigned int S, unsigned int T, + unsigned int M, unsigned int P, unsigned int Q, + unsigned int str_d, unsigned int str_h, unsigned int str_w, + unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) { + float alpha = 1.0f; + unsigned int DHW, WN, HW, HWN, DHWN, CRST, RST, RS; + unsigned int PQ, QN, PQN, MPQN; + unsigned int 
magic_HW, magic_W; + unsigned int shift_HW, shift_W; + unsigned int magic_RST, magic_RS, magic_S; + unsigned int shift_RST, shift_RS, shift_S; + unsigned int magic_PQu, shift_PQu; + unsigned int magic_Qu, shift_Qu; + unsigned int grid_P = 1; + unsigned int grid_Q = 1; + unsigned int grid_PQ = grid_P * grid_Q; + unsigned int grid_PQM = grid_PQ * M; + // input + WN = W * N; + HW = H * W; + HWN = H * WN; + DHW = D * HW; + DHWN = D * HWN; + // filter + RS = R * S; + RST = T * RS; + CRST = C * RST; // full reduction depth: C * R * S * T + // output + PQ = P * Q; // passed to the kernel as param_PQ + QN = Q * N; + PQN = P * QN; + MPQN = M * PQN; + // magic numbers + magic32(CRST, RST, magic_RST, shift_RST); + magic32(RST + 32, RS, magic_RS, shift_RS); + magic32(RS + 32, S, magic_S, shift_S); + magic32(DHW, HW, magic_HW, shift_HW); + magic32(HW, W, magic_W, shift_W); + magic32(grid_PQM, grid_PQ, magic_PQu, shift_PQu); + magic32(grid_PQ, grid_Q, magic_Qu, shift_Qu); + std::cout << "CRST: " << CRST << std::endl; + // test param set up + float *test_param; + cudaError_t cuda_error; + cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024); + if (cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " malloc error: " << cuda_error << std::endl; + return false; + } + cudaMemset(test_param, 0, sizeof(float) * 1024); + void *args[43] = {&test_param, &F, &I, &O, &alpha, + &N, &K, &D, &H, &W, &WN, &HWN, &DHWN, + &C, &CRST, + &RST, &magic_RST, &shift_RST, + &RS, &magic_RS, &shift_RS, + &S, &magic_S, &shift_S, + &pad_d, &pad_h, &pad_w, + &str_d, &str_h, &str_w, + &P, &Q, &PQ, &QN, &PQN, &MPQN, + &magic_Qu, &shift_Qu, + &magic_PQu, &shift_PQu, + &grid_P, &grid_Q, &grid_PQ}; + int gridX = grid_PQM; + int gridY = CRST / 128 + (CRST % 128 != 0); + int gridZ = K / 128 + (K % 128 != 0); + std::string kernel_name = "sconv_update_C128_K128"; + CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 256, 1, 1, + 0, 0, args, NULL); + if (res != CUDA_SUCCESS) { + std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl; + cudaFree(test_param); + return false; + } + cuCtxSynchronize(); + float* h_test = (float *)malloc(sizeof(float) * 256); + 
cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 256, cudaMemcpyDeviceToHost); + if (cuda_error != cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 256; ++i) { + std::cout << h_test[i] << " "; + } + std::cout << std::endl; + // free test_param + cudaFree(test_param); + free(h_test); + return true; +} + +int main() { + cudaFree(0); + float *d_I, *d_F, *d_O; + unsigned int N = 128, C = 3, K = 128, D = 1, H = 224, W = 224, T = 1, R = 11, S = 11; + unsigned int str_d = 1, str_h = 4, str_w = 4; + unsigned int pad_d = 0, pad_h = 3, pad_w = 3; + unsigned int M, P, Q; + cudaError_t cuda_error; + M = (D - T + 2 * pad_d) / str_d + 1; + P = (H - R + 2 * pad_h) / str_h + 1; + Q = (W - S + 2 * pad_w) / str_w + 1; + float *h_O = (float *)malloc(K * M * P * Q * N * sizeof(float)); + for (int i = 0; i < K * M * P * Q * N; ++i) { + h_O[i] = 1; + } + float *h_I = (float *)malloc(C * D * H * W * N * sizeof(float)); + for (int i = 0; i < C * D * H * W * N; ++i) { + h_I[i] = 1; + } + float* h_F = (float *)malloc(sizeof(float) * C * R * S * T * K); + // device memory + cudaMalloc((void**)&d_I, sizeof(float) * C * D * H * W * N); + cudaMalloc((void**)&d_F, sizeof(float) * C * R * S * T * K); + cudaMalloc((void**)&d_O, sizeof(float) * K * M * P * Q * N); + // memcpy h_I, h_O + cudaMemcpy(d_I, h_I, sizeof(float) * C * D * H * W * N, + cudaMemcpyHostToDevice); + cudaMemcpy(d_O, h_O, sizeof(float) * K * M * P * Q * N, + cudaMemcpyHostToDevice); + // load kernels + if (!load_kernels("./")) { + std::cerr << "Couldn't load all kernels" << std::endl; + exit(1); + } + // launch kernel + if (!update(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) { + std::cerr << "Launch error" << std::endl; + exit(1); + } + // output + std::cout << "result" << std::endl; + cuda_error = cudaMemcpy(h_F, d_F, sizeof(float) * C * R * S * T * K, cudaMemcpyDeviceToHost); + if (cuda_error != 
cudaSuccess) { + std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; + exit(1); + } + for (int i = 0; i < 128; ++i) { + std::cout << h_F[i] << " "; + } + std::cout << std::endl; + // free memory + free(h_O); + free(h_I); + free(h_F); + cudaFree(d_I); + cudaFree(d_F); + cudaFree(d_O); + // run successfully + std::cout << "finish" << std::endl; + return 0; +} diff --git a/Kernel/Convolution/Kepler/sconv_update_C128_K128.cu b/Kernel/Convolution/Kepler/sconv_update_C128_K128.cu new file mode 100644 index 0000000..c8f3e35 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_update_C128_K128.cu @@ -0,0 +1,60 @@ +// Stub kernel: declares the parameter list and the shared buffer of sconv_update_C128_K128. +// Each thread stores 1 into share[tid], then copies share[255 - tid] to *param_F and *param_test. +// NOTE(review): presumably only used to build a template cubin that the .sass of the +// same name is assembled into - confirm. +extern "C" +__global__ void sconv_update_C128_K128 ( + float* param_test, + float* param_F, + const float* param_I, + const float* param_E, + float param_alpha, + int param_N, + int param_K, + int param_D, + int param_H, + int param_W, + int param_WN, + int param_HWN, + int param_DHWN, + int param_C, + int param_CRST, + int param_RST, + int param_magic_RST, + int param_shift_RST, + int param_RS, + int param_magic_RS, + int param_shift_RS, + int param_S, + int param_magic_S, + int param_shift_S, + int param_pad_d, + int param_pad_h, + int param_pad_w, + int param_str_d, + int param_str_h, + int param_str_w, + int param_P, + int param_Q, + int param_PQ, + int param_QN, + int param_PQN, + int param_MPQN, + int param_magic_Q, + int param_shift_Q, + int param_magic_PQ, + int param_shift_PQ, + int param_part_P, + int param_part_Q, + int param_part_PQ) { + __shared__ float share[(128 * 16 + 32) * 4 + 6]; + + int tid = threadIdx.x; + + share[tid] = 1; + // make every thread's store visible before another thread's slot is read + __syncthreads(); + + *param_F = share[255 - tid]; + *param_test = share[255 - tid]; + } diff --git a/Kernel/Convolution/Kepler/sconv_update_C128_K128.sass b/Kernel/Convolution/Kepler/sconv_update_C128_K128.sass new file mode 100644 index 0000000..38f0810 --- /dev/null +++ b/Kernel/Convolution/Kepler/sconv_update_C128_K128.sass @@ -0,0 +1,720 @@ +# Kernel: sconv_update_C128_K128 +// debug: +// mode1 +//-:-:-:-:00 
MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +//-:-:-:-:00 SHL tmp_shl, tid, 0x2; +//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0; +//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +//-:-:-:-:00 I2F.F32.U32 rst, rst; +//-:-:-:-:00 ST.E [tmp_param00], rst; +//-:-:-:-:00 EXIT; + +// mode2 +//-:-:-:-:00 MOV tmp_param0, param_test[0]; +//-:-:-:-:00 MOV tmp_param1, param_test[1]; +// +//-:-:-:-:00 MOV32I k, 0x40000000; +//-:-:-:-:00 ST.E [tmp_param0], k; +//-:-:-:-:00 EXIT; + +// modify steps: +// XMAD->IMAD +// shared memory addresses->RZ +// LDG->LD +// LEA->MOV, IADD, SHL +// XMAD.LO2C->IMAD.U32.U32 +// XMAD.PSL->IMAD.U32.U32 +// VMAD->IMAD, IADD +// MOV->MOV32I +// IADD3->IADD, IADD +// POPC +// ST.CG->ST +// control code +// comments +// LDS.U->LDS +// LOP3 +// register<0-7>->register<0-3>, register<4-7> +// avoid register conflicts +// PT: 0xffffff + +// initial->1200 +// bank conflict->1288 +// alignment+dual issue+reuse->1600 +// half ldg.128->1700 +// all ldg.128->1777 +// control codes->1900 +// scheduling->1937 +// reduce unnecessary instructions->2100 + + + addr_zero : 4x<(128 * 16 + 32) * 4 + 0> + addr_m : 4x<(128 * 16 + 32) * 4 + 4> + addr_q : 4x<(128 * 16 + 32) * 4 + 5> + szBuf : (128 * 16 + 32) + + param_test[0] : c[0x0][0x140] + param_test[1] : c[0x0][0x144] + param_F[0] : c[0x0][0x148] + param_F[1] : c[0x0][0x14c] + param_I[0] : c[0x0][0x150] + param_I[1] : c[0x0][0x154] + param_E[0] : c[0x0][0x158] + param_E[1] : c[0x0][0x15c] + param_alpha : c[0x0][0x160] + param_N : c[0x0][0x164] + param_K : c[0x0][0x168] + param_D : c[0x0][0x16c] + param_H : c[0x0][0x170] + param_W : c[0x0][0x174] + param_WN : c[0x0][0x178] + param_HWN : c[0x0][0x17c] + param_DHWN : c[0x0][0x180] + param_C : c[0x0][0x184] + param_CRST : c[0x0][0x188] + param_RST : c[0x0][0x18c] + param_magic_RST : c[0x0][0x190] + param_shift_RST : c[0x0][0x194] + param_RS : c[0x0][0x198] + param_magic_RS : c[0x0][0x19c] + param_shift_RS : c[0x0][0x1a0] + 
param_S : c[0x0][0x1a4] + param_magic_S : c[0x0][0x1a8] + param_shift_S : c[0x0][0x1ac] + param_pad_d : c[0x0][0x1b0] + param_pad_h : c[0x0][0x1b4] + param_pad_w : c[0x0][0x1b8] + param_str_d : c[0x0][0x1bc] + param_str_h : c[0x0][0x1c0] + param_str_w : c[0x0][0x1c4] + param_P : c[0x0][0x1c8] + param_Q : c[0x0][0x1cc] + param_PQ : c[0x0][0x1d0] + param_QN : c[0x0][0x1d4] + param_PQN : c[0x0][0x1d8] + param_MPQN : c[0x0][0x1dc] + param_magic_Q : c[0x0][0x1e0] + param_shift_Q : c[0x0][0x1e4] + param_magic_PQ : c[0x0][0x1e8] + param_shift_PQ : c[0x0][0x1ec] + param_grid_P : c[0x0][0x1f0] + param_grid_Q : c[0x0][0x1f4] + param_grid_PQ : c[0x0][0x1f8] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ blkI, blkE + 68-111 ~ tidX, tidY, tid1, tid7, tid128, shiftX, blkMPQ, pq, m + + 64-95 ~ tidYY, mm, mt, pr, y, z, y0, yH, z0, zD, bounds_yz, c, r, t, rs, rst + 64-95 ~ qs, x, x0, xW, bounds_x, ti, te, Q + + 64-67 : j0Ex<0-3> + 68-71 : j0Iy<0-3> + 72-75 : j0Ex<4-7> + 76-79 : j0Iy<4-7> + 80-83 : j1Ex<0-3> + 84-87 : j1Iy<0-3> + 88-91 : j1Ex<4-7> + 92-95 : j1Iy<4-7> + + 96-99 : loadI<0-3> + 100-103 : loadE<0-3> + 104-107 : loadI<4-7> + 108-111 : loadE<4-7> + + 112-115 : trackI<0-1>, trackE<0-1> + + 116-124 ~ writeS, loopN, e, i, p, q, k, crst, s + 125-127 : swapBuf, readIs, readEs + 128-129 : tmp_data, tmp_shl + 130-131 : tmp_param0, tmp_param1 + 132 : p_and + 133 : tid + 134-135 : tmp_param0<0-1> + + 68-83 : c<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 84-124 ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128 + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; // M +-:-:-:-:00 S2R blkI, SR_CTAID.Y; // CRST 
/ 128 +-:-:-:-:00 S2R blkE, SR_CTAID.Z; // K / 128 + +// tidX = tid >> 1 +// tidX = 0 : 1 : 128 +// tidY = (tid & 1) << 2 +// tidY = 0, 4 +// shiftX = (tid & 1) << 4 +// shiftX = 0, 16 +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 SHR.U32 tidX, tid, 1; +-:-:-:-:00 SHL tidY, tid1, 2; +-:-:-:-:00 SHL shiftX, tid1, 4; + +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + + return join '', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15; + + +-:-:-:-:00 PSETP.AND.AND P0, PT, PT, PT, PT; + +// m = blkMPQ / PQ +// pq = blkMPQ % PQ +// m = 0 : 1 : M - 1; +// PQ = 1; +// pq = 0 +-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ; +-:-:-:-:00 SHR.U32 m, m, param_shift_PQ; +-:-:-:-:00 IMAD pq, m, param_grid_PQ, RZ; +-:-:-:-:00 IADD pq, -pq, blkMPQ; +// p = pq / Q +// q = pq % Q +// p = 0 +// q = 0 +-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ; +-:-:-:-:00 SHR.U32 p, p, param_shift_Q; +-:-:-:-:00 IMAD q, p, param_grid_Q, RZ; +-:-:-:-:00 IADD q, -q, pq; + +// We need to be able to restore m and q at each P iteration +// Register spill to shared +-:-:-:-:00 STS [RZ + addr_m], m; +-:-:-:-:00 STS [RZ + addr_q], q; + +// tidX = 0 : 1 : 127 +// tidY = 0, 4 +// shiftX = 0, 16 +// writeS <= (512 + 128 + 16) * 4 +// if tidY > 512, shiftX = 16 +// writeS = (tidY * 128 + tidX + shiftX) * 4 + szBuf * 8 +// -------------------- +// -------------------- +// 0, 4 -------------------- +// tidY -------------------- +// ---- tidX = 0 : 1 : 127 + +-:-:-:-:00 ISCADD writeS, tidY, tidX, 7; +-:-:-:-:00 IADD writeS, writeS, shiftX; +-:-:-:-:00 ISCADD writeS, writeS, 4x, 2; + +// readIs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +// [6][5][4][0] * 16; +// readIs = 0 : 4 : 63 +-:-:-:-:00 LOP.AND readIs, tid, 0x70; +-:-:-:-:00 SHR.U32 readIs, readIs, 3; +-:-:-:-:00 LOP.OR readIs, readIs, tid1; +-:-:-:-:00 SHL readIs, readIs, 4; + +// readEs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + szBuf; +// [7][3][2][1] * 16 + szBuf * 4; +// readEs = 0 : 4 : 63 +-:-:-:-:00 LOP.AND 
tid128, tid, 128; +-:-:-:-:00 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 SHR.U32 readEs, tid128, 4; +-:-:-:-:00 LOP.OR readEs, readEs, tid7; +-:-:-:-:00 ISCADD readEs, readEs, 4x, 4; + +-:-:-:-:00 MOV32I swapBuf, -4x; + +// crst = blockI * 128 + tidX +-:-:-:-:00 ISCADD crst, blkI, tidX, 7; + +// k = blockE * 128 + tidX +-:-:-:-:00 ISCADD k, blkE, tidX, 7; + +// loopN = N +-:-:-:-:00 MOV loopN, param_N; + +NEXT_P: + +// tidYY = 0 : 1 : 255 +-:-:-:-:00 S2R tidYY, SR_TID.X; +-:-:-:-:00 LDS mm, [RZ + addr_m]; +-:-:-:-:00 LDS q, [RZ + addr_q]; + +// c = crst / RST +// rst = crst % RST +-:-:-:-:00 IMAD.U32.U32 c, crst, param_magic_RST, RZ; +-:-:-:-:00 SHR.U32 c, c, param_shift_RST; +-:-:-:-:00 IMAD rst, c, param_RST, RZ; +-:-:-:-:00 IADD rst, -rst, crst; + +// t = rst / RS +// rs = rst % RS +-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ; +-:-:-:-:00 SHR.U32 t, t, param_shift_RS; +-:-:-:-:00 IMAD rs, t, param_RS, RZ; +-:-:-:-:00 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ; +-:-:-:-:00 SHR.U32 r, r, param_shift_S; +-:-:-:-:00 IMAD s, r, param_S, RZ; +-:-:-:-:00 IADD s, -s, rs; +// y = p * u - pad_h + r +// z = m * w - pad_d + t +-:-:-:-:00 IMAD pr, p, param_str_h, RZ; +-:-:-:-:00 IMAD mt, mm, param_str_d, RZ; +-:-:-:-:00 IADD y, pr, -param_pad_h; +-:-:-:-:00 IADD y, y, r; +-:-:-:-:00 IADD z, mt, -param_pad_d; +-:-:-:-:00 IADD z, z, t; +// e = k * MPQN + m * PQN + p * QN + tidYY +// tidYY = 0, 4 +-:-:-:-:00 LOP.AND tidYY, tidYY, 1; +-:-:-:-:00 SHL tidYY, tidYY, 2; +-:-:-:-:00 IMAD.U32.U32 e, p, param_QN, tidYY; +-:-:-:-:00 IMAD.U32.U32 e, mm, param_PQN, e; +-:-:-:-:00 IMAD.U32.U32 e, k, param_MPQN, e; +// i = c * DHWN + z * HWN + y * WN + tidYY +-:-:-:-:00 IMAD.U32.U32 i, y, param_WN, tidYY; +-:-:-:-:00 IMAD.U32.U32 i, z, param_HWN, i; +-:-:-:-:00 IMAD.U32.U32 i, c, param_DHWN, i; +// mode1 +// -:-:-:-:00 MOV tmp_param0, param_test[0]; +// -:-:-:-:00 MOV tmp_param1, param_test[1]; +// -:-:-:-:00 
SHL tmp_shl, tid, 0x2; +// -:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0; +// -:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1; +// -:-:-:-:00 I2F.F32.U32 i, i; +// -:-:-:-:00 ST.E [tmp_param00], i; +// -:-:-:-:00 EXIT; +// bounds_yz = y < 0 || y >= H || z < 0 || z >= D ? -1 : 0 +-:-:-:-:00 ISET.LT.AND y0, y, RZ, PT; +-:-:-:-:00 ISET.GE.AND yH, y, param_H, PT; +-:-:-:-:00 ISET.LT.AND z0, z, RZ, PT; +-:-:-:-:00 ISET.GE.AND zD, z, param_D, PT; +-:-:-:-:00 LOP.OR bounds_yz, y0, yH; +-:-:-:-:00 LOP.OR bounds_yz, bounds_yz, z0; +-:-:-:-:00 LOP.OR bounds_yz, bounds_yz, zD; +// doLoadCRST = crst < CRST && bounds_yz == 0 +-:-:-:-:00 ISETP.LT.AND P4, PT, crst, param_CRST, PT; +-:-:-:-:00 ISETP.EQ.AND P4, PT, bounds_yz, RZ, P4; +// p += grid_P +// p = p + 1 +-:-:-:-:00 IADD p, p, param_grid_P; + +-:-:-:-:00 ISETP.LT.AND P6, PT, p, param_P, PT; + +NEXT_Q: + +// Zigzag q but only if grid_P < P +-:-:-:-:00 LOP.AND p_and, p, 1; +// P1 = (p & 1) != 0 +-:-:-:-:00 ISETP.NE.AND P1, PT, RZ, p_and, PT; +// Q = grid_P +-:-:-:-:00 MOV Q, param_grid_P; +// P1 = grid_P < param_P && P1; Q = P1 ? param_Q - 1 - q : q +-:-:-:-:00 ISETP.LT.AND P1, PT, Q, param_P, P1; +-:-:-:-:00 MOV32I Q, -1; +-:-:-:-:00 @P1 IADD tmp_data, -q, param_Q; +-:-:-:-:00 @P1 IADD Q, tmp_data, Q; +-:-:-:-:00 @!P1 MOV Q, q; +// k < K +-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, PT; +// qs = Q * v - pad_w +// x = qs + s +-:-:-:-:00 IMAD qs, Q, param_str_w, RZ; +-:-:-:-:00 IADD x, qs, -param_pad_w; +-:-:-:-:00 IADD x, x, s; +// bounds_x = x < 0 || x >= W ? 
-1 : 0 +-:-:-:-:00 ISET.LT.AND x0, x, RZ, PT; +-:-:-:-:00 ISET.GE.AND xW, x, param_W, PT; +-:-:-:-:00 LOP.OR bounds_x, x0, xW; +// doLoad = crst < CRST && bounds_yz == 0 && bounds_x == 0 +-:-:-:-:00 ISETP.EQ.AND P2, PT, bounds_x, RZ, P4; +// trackI = I + i + x * N +-:-:-:-:00 IMAD ti, x, param_N, i; +//-:-:-:-:00 LEA trackI0.CC, ti, param_I[0], 2; +//-:-:-:-:00 LEA.HI.X trackI1, ti, param_I[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_I[0]; +-:-:-:-:00 MOV tmp_param1, param_I[1]; +-:-:-:-:00 SHL tmp_shl, ti, 0x2; +-:-:-:-:00 IADD trackI0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackI1, RZ, tmp_param1; +// trackE = E + e + q * N +-:-:-:-:00 IMAD te, Q, param_N, e; +//-:-:-:-:00 LEA trackE0.CC, te, param_E[0], 2; +//-:-:-:-:00 LEA.HI.X trackE1, te, param_E[1], RZ, 2; +-:-:-:-:00 MOV tmp_param0, param_E[0]; +-:-:-:-:00 MOV tmp_param1, param_E[1]; +-:-:-:-:00 SHL tmp_shl, te, 0x2; +-:-:-:-:00 IADD trackE0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackE1, RZ, tmp_param1; +// q += grid_Q +// q = q + 1 +-:-:-:-:00 IADD q, q, param_grid_Q; +-:-:-:-:00 ISETP.LT.AND P5, PT, q, param_Q, PT; + +-:-:-:-:00 @!P0 IADD loopN, loopN, param_N; + +-:-:-:-:00 @!P0 BRA.U NEXT_PQ; + +-:-:-:-:00 PSETP.AND.AND P0, PT, PT, PT, !PT; + +-:-:-:-:00 @P2 LD.E.CI.128 loadI0, [trackI + 4x<0>]; +-:-:-:-:00 @P2 LD.E.CI.128 loadI4, [trackI + 4x<8>]; +-:-:-:-:00 @!P2 LDS.128 loadI0, [RZ + addr_zero]; +-:-:-:-:00 @!P2 LDS.128 loadI4, [RZ + addr_zero]; + +-:-:-:-:00 ISETP.LE.AND P1, PT, loopN, 32, PT; + +-:-:-:-:00 @P3 LD.E.CI.128 loadE0, [trackE + 4x<0>]; +-:-:-:-:00 @P3 LD.E.CI.128 loadE4, [trackE + 4x<8>]; +-:-:-:-:00 @!P3 LDS.128 loadE0, [RZ + addr_zero]; +-:-:-:-:00 @!P3 LDS.128 loadE4, [RZ + addr_zero]; + +-:-:-:-:00 STS [writeS + 4x< 0*128>], loadI0; +-:-:-:-:00 STS [writeS + 4x< 1*128>], loadI1; +-:-:-:-:00 STS [writeS + 4x< 2*128>], loadI2; +-:-:-:-:00 STS [writeS + 4x< 3*128>], loadI3; + +-:-:-:-:00 STS [writeS + 4x< 8*128 + 16>], loadI4; +-:-:-:-:00 STS [writeS + 4x< 9*128 + 16>], 
loadI5; +-:-:-:-:00 STS [writeS + 4x<10*128 + 16>], loadI6; +-:-:-:-:00 STS [writeS + 4x<11*128 + 16>], loadI7; + +-:-:-:-:00 IADD trackI0.CC, trackI0, 4x<16>; +-:-:-:-:00 PSETP.AND.AND P5, PT, P1, P5, PT; + +-:-:-:-:00 STS [writeS + 4x< 0*128 + szBuf>], loadE0; +-:-:-:-:00 STS [writeS + 4x< 1*128 + szBuf>], loadE1; +-:-:-:-:00 STS [writeS + 4x< 2*128 + szBuf>], loadE2; +-:-:-:-:00 STS [writeS + 4x< 3*128 + szBuf>], loadE3; + +-:-:-:-:00 PSETP.AND.AND P6, PT, P1, P6, PT; + +-:-:-:-:00 STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4; +-:-:-:-:00 STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5; +-:-:-:-:00 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6; +-:-:-:-:00 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7; + +-:-:-:-:00 IADD.X trackI1, trackI1, RZ; + +-:-:-:-:00 IADD trackE0.CC, trackE0, 4x<16>; + +-:-:-:-:00 IADD readEs, readEs, -swapBuf; +-:-:-:-:00 IADD readIs, readIs, -swapBuf; +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 IADD writeS, writeS, swapBuf; +-:-:-:-:00 IADD swapBuf, RZ, -swapBuf; + +-:-:-:-:00 IADD.X trackE1, trackE1, RZ; + +-:-:-:-:00 @P2 LD.E.CI.128 loadI0, [trackI + 4x<0>]; +-:-:-:-:00 @P2 LD.E.CI.128 loadI4, [trackI + 4x<8>]; +-:-:-:-:00 @P3 LD.E.CI.128 loadE0, [trackE + 4x<0>]; +-:-:-:-:00 @P3 LD.E.CI.128 loadE4, [trackE + 4x<8>]; + +-:-:-:-:00 @P2 IADD trackI0.CC, trackI0, 4x<16>; +-:-:-:-:00 @P2 IADD.X trackI1, trackI1, RZ; +-:-:-:-:00 @P3 IADD trackE0.CC, trackE0, 4x<16>; +-:-:-:-:00 @P3 IADD.X trackE1, trackE1, RZ; + +-:-:-:-:00 @P5 BRA.U NEXT_Q; +-:-:-:-:00 @P6 BRA.U NEXT_P; + +-:-:-:-:00 ISETP.LT.AND P5, PT, q, param_Q, PT; +-:-:-:-:00 ISETP.LT.AND P6, PT, p, param_P, PT; + +NEXT_PQ: + +-:-:-:-:00 LDS.128 j0Ex0, [readEs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS.128 j0Ex4, [readEs + 4x<0*128 + 64>]; +-:-:-:-:00 LDS.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + +// P0 loop N +// P2 bounds I +// P3 bounds E +// P4 bounds yz +// P5 loop Q +// P6 loop P + +//loop = N >= 16 && (N >= 32 || (!p5 && !p6)) + 
+NEXT_16N: + + + + my %insert = + ( + j0c8 => "-:-:-:-:00 IADD loopN, loopN, -16;\n", + j0c14 => "-:-:-:-:00 ISETP.GE.AND P0, PT, loopN, 16, PT;\n", + + j4c8 => "-:-:-:-:00 \@P0 STS [writeS + 4x< 0*128>], loadI0;\n", + j4c10 => "-:-:-:-:00 \@P0 STS [writeS + 4x< 1*128>], loadI1;\n", + j4c12 => "-:-:-:-:00 \@P0 STS [writeS + 4x< 2*128>], loadI2;\n", + j4c14 => "-:-:-:-:00 \@P0 STS [writeS + 4x< 3*128>], loadI3;\n", + + j5c8 => "-:-:-:-:00 \@P0 STS [writeS + 4x< 8*128 + 16>], loadI4;\n", + j5c10 => "-:-:-:-:00 \@P0 STS [writeS + 4x< 9*128 + 16>], loadI5;\n", + j5c12 => "-:-:-:-:00 \@P0 STS [writeS + 4x<10*128 + 16>], loadI6;\n", + j5c14 => "-:-:-:-:00 \@P0 STS [writeS + 4x<11*128 + 16>], loadI7;\n", + + j5c16 => "-:-:-:-:00 ISETP.GE.AND P2, PT, loopN, 32, P2;\n", + + j5c60 => "-:-:-:-:00 \@P2 LD.E.CI.128 loadI0, [trackI + 4x<0>];\n", + j5c62 => "-:-:-:-:00 \@P2 LD.E.CI.128 loadI4, [trackI + 4x<8>];\n", + + j6c16 => "-:-:-:-:00 \@!P2 LDS.128 loadI0, [RZ + addr_zero];\n", + j7c16 => "-:-:-:-:00 \@!P2 LDS.128 loadI4, [RZ + addr_zero];\n", + + j10c57 => "-:-:-:-:00 \@P2 IADD trackI0.CC, trackI0, 4x<16>;\n", + j10c62 => "-:-:-:-:00 \@P2 IADD.X trackI1, trackI1, RZ;\n", + + j12c8 => "-:-:-:-:00 \@P0 STS [writeS + 4x<0*128 + szBuf>], loadE0;\n", + j12c10 => "-:-:-:-:00 \@P0 STS [writeS + 4x<1*128 + szBuf>], loadE1;\n", + j12c12 => "-:-:-:-:00 \@P0 STS [writeS + 4x<2*128 + szBuf>], loadE2;\n", + j12c14 => "-:-:-:-:00 \@P0 STS [writeS + 4x<3*128 + szBuf>], loadE3;\n", + + j13c8 => "-:-:-:-:00 \@P0 STS [writeS + 4x<8*128 + szBuf + 16>], loadE4;\n", + j13c10 => "-:-:-:-:00 \@P0 STS [writeS + 4x<9*128 + szBuf + 16>], loadE5;\n", + j13c12 => "-:-:-:-:00 \@P0 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;\n", + j13c14 => "-:-:-:-:00 \@P0 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;\n", + + j13c16 => "-:-:-:-:00 ISETP.GE.AND P3, PT, loopN, 32, P3;\n", + + j13c60 => "-:-:-:-:00 \@P3 LD.E.CI.128 loadE0, [trackE + 4x<0>];\n", + j13c62 => "-:-:-:-:00 \@P3 LD.E.CI.128 loadE4, 
[trackE + 4x<8>];\n", + + j14c16 => "-:-:-:-:00 @!P3 LDS.128 loadE0, [RZ + addr_zero];\n", + j15c16 => "-:-:-:-:00 @!P3 LDS.128 loadE4, [RZ + addr_zero];\n", + + j15c57 => "-:-:-:-:00 \@P3 IADD trackE0.CC, trackE0, 4x<16>;\n", + j15c62 => "-:-:-:-:00 \@P3 IADD.X trackE1, trackE1, RZ;\n", + + j14c63 => "-:-:-:-:00 \@P0 BAR.SYNC 0;\n" . + "-:-:-:-:00 \@P0 IADD readEs, readEs, -swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD writeS, writeS, swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j15c24 => "-:-:-:-:00 ISETP.GT.AND P1, PT, loopN, 32, PT;\n", + j15c37 => "-:-:-:-:00 PSETP.AND.OR P1, PT, !P5, !P6, P1;\n", + j15c50 => "-:-:-:-:00 PSETP.AND.AND P0, PT, P0, P1, PT;\n", + + j15c63 => "-:-:-:-:00 \@P0 BRA.U NEXT_16N;\n" . + "-:-:-:-:00 \@P5 BRA.U NEXT_Q;\n" . + "-:-:-:-:00 \@P6 BRA.U NEXT_P;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? '@P0' : ' '; + my $shift = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2; + my $barrier = $j == 14 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "-:-:-:-:00 %s LDS.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "-:-:-:-:00 %s LDS.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "-:-:-:-:00 %s LDS.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "-:-:-:-:00 %s LDS.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|LD|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "-:-:-:-:00"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkI, SR_CTAID.Y; +-:-:-:-:00 S2R blkE, SR_CTAID.Z; + +-:-:-:-:00 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +-:-:-:-:00 IADD readEs, readEs, -4x; +-:-:-:-:00 @P0 IADD readIs, readIs, -swapBuf; +-:-:-:-:00 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 128 + readEs; +-:-:-:-:00 ISCADD writeCs, readIs, readEs, 5; + +-:-:-:-:00 LOP.AND tid31, tid, 31; +-:-:-:-:00 LOP.AND tid96, tid, 96; +-:-:-:-:00 LOP.AND t128, tid, 128; + +// kk = tid31 | (t128 >> 2); +-:-:-:-:00 SHR.U32 kk, t128, 2; +-:-:-:-:00 LOP.OR kk, tid31, kk; + +// readCs = ((tid96 << 4) | kk) << 2; +-:-:-:-:00 SHL readCs, tid96, 4; +-:-:-:-:00 LOP.OR readCs, readCs, kk; +-:-:-:-:00 SHL readCs, readCs, 2; + +// kk += blkE*128; +-:-:-:-:00 ISCADD kk, blkE, kk, 7; + +// crst = blkI*128 + (tid96 >> 1) +-:-:-:-:00 SHR.U32 crst00, tid96, 1; +-:-:-:-:00 ISCADD crst00, blkI, crst00, 7; +-:-:-:-:00 IADD crst04, crst00, 4; +-:-:-:-:00 IADD crst08, crst00, 8; +-:-:-:-:00 IADD crst12, crst00, 12; + +-:-:-:-:00 MOV K, param_K; +-:-:-:-:00 SHL K1, K, 2; +-:-:-:-:00 SHL K4, K, 4; +-:-:-:-:00 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +-:-:-:-:00 IMAD tmp_param0, crst00, K, RZ; +-:-:-:-:00 IADD tf, tmp_param0, kk; +//-:-:-:-:00 LEA track00F0.CC, tf, param_F[0], 0x2; +//-:-:-:-:00 LEA.HI.X track00F1, tf, param_F[1], RZ, 0x2; +-:-:-:-:00 MOV tmp_param0, param_F[0]; +-:-:-:-:00 MOV tmp_param1, param_F[1]; +-:-:-:-:00 SHL tmp_shl, tf, 0x2; +-:-:-:-:00 IADD track00F0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X track00F1, RZ, tmp_param1; + +// kk < K +-:-:-:-:00 
ISETP.LT.AND P5, PT, kk, param_K, PT; +-:-:-:-:00 IADD kk, kk, 64; +-:-:-:-:00 ISETP.LT.AND P6, PT, kk, param_K, PT; + +-:-:-:-:00 MOV alpha, param_alpha; + +-:-:-:-:00 IADD track04F0.CC, track00F0, K4; +-:-:-:-:00 IADD.X track04F1, track00F1, RZ; +-:-:-:-:00 IADD track08F0.CC, track04F0, K4; +-:-:-:-:00 IADD.X track08F1, track04F1, RZ; +-:-:-:-:00 IADD track12F0.CC, track08F0, K4; +-:-:-:-:00 IADD.X track12F1, track08F1, RZ; + +-:-:-:-:00 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "-:-:-:-:00 IADD track00F0.CC, track00F0, K60;\n" . + "-:-:-:-:00 IADD crst00, crst00, 60;\n" . + "-:-:-:-:00 IADD.X track00F1, track00F1, RZ;\n" . + "-:-:-:-:00 IADD track04F0.CC, track04F0, K60;\n" . + "-:-:-:-:00 IADD crst04, crst04, 60;\n" . + "-:-:-:-:00 IADD.X track04F1, track04F1, RZ;\n" . + "-:-:-:-:00 IADD track08F0.CC, track08F0, K60;\n" . + "-:-:-:-:00 IADD crst08, crst08, 60;\n" . + "-:-:-:-:00 IADD.X track08F1, track08F1, RZ;\n" . + "-:-:-:-:00 IADD track12F0.CC, track12F0, K60;\n" . + "-:-:-:-:00 IADD crst12, crst12, 60;\n" . + "-:-:-:-:00 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "-:-:-:-:00 FMUL c0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL c1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL c2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL c3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL c4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL c5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL c6, cx6y%d, alpha;\n" . 
+ "-:-:-:-:00 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "-:-:-:-:00 CAL STORE_C;\n\n"; + } + return $out; + + + +-:-:-:-:00 EXIT; + +STORE_C: + +-:-:-:-:00 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +-:-:-:-:00 IADD crst00, crst00, 1; +-:-:-:-:00 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +-:-:-:-:00 IADD crst04, crst04, 1; +-:-:-:-:00 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +-:-:-:-:00 IADD crst08, crst08, 1; +-:-:-:-:00 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +-:-:-:-:00 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +-:-:-:-:00 STS.128 [writeCs+4x<00>], c0; +-:-:-:-:00 STS.128 [writeCs+4x<64>], c4; + +-:-:-:-:00 LDS c0, [readCs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS c2, [readCs + 4x<1*128 + 00>]; +-:-:-:-:00 LDS c4, [readCs + 4x<2*128 + 00>]; +-:-:-:-:00 LDS c6, [readCs + 4x<3*128 + 00>]; + +-:-:-:-:00 @P0 RED.E.ADD.F32.FTZ.RN [track00F], c0; +-:-:-:-:00 PSETP.AND.AND P0, PT, P0, P6, PT; +-:-:-:-:00 @P1 RED.E.ADD.F32.FTZ.RN [track04F], c2; +-:-:-:-:00 PSETP.AND.AND P1, PT, P1, P6, PT; +-:-:-:-:00 @P2 RED.E.ADD.F32.FTZ.RN [track08F], c4; +-:-:-:-:00 PSETP.AND.AND P2, PT, P2, P6, PT; +-:-:-:-:00 @P3 RED.E.ADD.F32.FTZ.RN [track12F], c6; +-:-:-:-:00 PSETP.AND.AND P3, PT, P3, P6, PT; + +-:-:-:-:00 LDS c1, [readCs + 4x<0*128 + 64>]; +-:-:-:-:00 LDS c3, [readCs + 4x<1*128 + 64>]; +-:-:-:-:00 LDS c5, [readCs + 4x<2*128 + 64>]; +-:-:-:-:00 LDS c7, [readCs + 4x<3*128 + 64>]; + +-:-:-:-:00 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], c1; +-:-:-:-:00 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], c3; +-:-:-:-:00 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], c5; +-:-:-:-:00 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], c7; + +-:-:-:-:00 IADD track00F0.CC, track00F0, K1; +-:-:-:-:00 IADD.X track00F1, track00F1, RZ; +-:-:-:-:00 IADD track04F0.CC, track04F0, K1; +-:-:-:-:00 IADD.X track04F1, track04F1, RZ; +-:-:-:-:00 
IADD track08F0.CC, track08F0, K1; +-:-:-:-:00 IADD.X track08F1, track08F1, RZ; +-:-:-:-:00 IADD track12F0.CC, track12F0, K1; +-:-:-:-:00 IADD.X track12F1, track12F1, RZ; + +-:-:-:-:00 RET; diff --git a/Kernel/Convolution/Maxwell/hconv_bprop_C1_N64.sass b/Kernel/Convolution/Maxwell/hconv_bprop_C1_N64.sass new file mode 100644 index 0000000..fb00d82 --- /dev/null +++ b/Kernel/Convolution/Maxwell/hconv_bprop_C1_N64.sass @@ -0,0 +1,663 @@ +# Kernel: hconv_bprop_C32_N64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + # Pick the conversion opcode string and the load-type suffix used by this kernel: + # I2F.F32.S16 with S16 when $int16 is true, F2F.F32.F16 with U16 otherwise. + # $int16 is only declared here; presumably the assembler driver sets it before this runs (TODO confirm). + # convert() and dtype() expose the two strings to the inline template calls in the instruction stream. + our $int16; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert { return $convert; } + + our $dtype = $int16 ? 
'S16' : 'U16'; + sub dtype { return $dtype; } +-] + + + addr_lut : 4x<64*4> + + param_I[0] : c[0x0][0x140] + param_I[1] : c[0x0][0x144] + param_E[0] : c[0x0][0x148] + param_E[1] : c[0x0][0x14c] + param_F[0] : c[0x0][0x150] + param_F[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_N : c[0x0][0x15c] + param_K : c[0x0][0x160] + param_D : c[0x0][0x164] + param_H : c[0x0][0x168] + param_W : c[0x0][0x16c] + param_WN : c[0x0][0x170] + param_HWN : c[0x0][0x174] + param_DHWN : c[0x0][0x178] + param_C : c[0x0][0x17c] + param_CRST : c[0x0][0x180] + param_RST : c[0x0][0x184] + param_magic_RST : c[0x0][0x188] + param_shift_RST : c[0x0][0x18c] + param_RS : c[0x0][0x190] + param_magic_RS : c[0x0][0x194] + param_shift_RS : c[0x0][0x198] + param_S : c[0x0][0x19c] + param_magic_S : c[0x0][0x1a0] + param_shift_S : c[0x0][0x1a4] + param_pad_d : c[0x0][0x1a8] + param_pad_h : c[0x0][0x1ac] + param_pad_w : c[0x0][0x1b0] + param_str_d : c[0x0][0x1b4] + param_str_h : c[0x0][0x1b8] + param_str_w : c[0x0][0x1bc] + param_Q : c[0x0][0x1c0] + param_PQ : c[0x0][0x1c4] + param_QN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_MPQN : c[0x0][0x1d0] + param_magic_Q : c[0x0][0x1d4] + param_shift_Q : c[0x0][0x1d8] + param_magic_PQ : c[0x0][0x1dc] + param_shift_PQ : c[0x0][0x1e0] + param_CRST8 : c[0x0][0x1e4] + param_MPQN8 : c[0x0][0x1e8] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkE, blkF, blkMPQ + + 68-119 ~ k<0|4>, tidFX, tidEX, tid1, tid7, m, p, q, crst, n, tf<0|4>, te, te<0|4>, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-79 : j0Ex<0-7>, j0Fy<0-7> + 80-95 : j1Ex<0-7>, j1Fy<0-7> + + 96-103 : load0F<0-3>, load4F<0-3> + 96-103 : store0F<0-3>, store4F<0-3> + + 104-107 : 
load0E<0-3> + 104-107 : store0E<0-3> + 112-115 : store0E<4-7> + + 108-111 : load4E<0-3> + 108-111 : store4E<0-3> + 112-115 : store4E<4-7> + + 116-119 : track0F<0-1>, track4F<0-1> + 120-123 : track0E<0-1>, track4E<0-1> + + 124-127 ~ writeEs, writeFs, swapBuf, K + 128-132 ~ readEs, readFs, mt, pr, qs + + 68-71 ~ lutStore, sliceI + 72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD + + 72-93 : c<0-7>, cs<0-3>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1> + 94-127 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkF, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; + + +// tidFX = (tid & 7) << 2 +// tidEX = (tid & 7) << 3 +// k = tid >> 3 +01:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHL tidFX, tid7, 2; +--:-:-:-:1 SHL tidEX, tid7, 3; +--:-:-:-:1 SHR.U32 k0, tid, 3; +--:-:-:-:1 IADD k4, k0, 4; + +--:-:-:-:1 MOV K, param_K; + +// Store RZ to shared address [RZ], then read it back with sixteen generated LDS.U.128 loads so that +// czero00..czero63 (registers 0-63, the same registers the register map assigns to the cx/y accumulators) are initialised. +--:-:-:-:1 STS.128 [RZ], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15; + + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = pq / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 
@P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = pq % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; + +// mt = m * str_d - pad_d +// pr = p * str_h - pad_h +// qs = q * str_w - pad_w +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, q, param_str_w, RZ; +--:-:-:-:1 IADD mt, mt, -param_pad_d; +--:-:-:-:1 IADD pr, pr, -param_pad_h; +--:-:-:-:1 IADD qs, qs, -param_pad_w; + +// crst = blkF*32 + tidFX +// n = blkE*64 + tidEX +04:-:-:-:1 ISCADD crst, blkF, tidFX, 5; +08:-:-:-:1 ISCADD n, blkE, tidEX, 6; + +// trackF = k*CRST + crst +--:-:-:-:1 XMAD tf0, k0, param_CRST, crst; +--:-:-:-:1 XMAD tf4, k4, param_CRST, crst; +--:-:-:-:1 LEA track0F0.CC, tf0, param_F[0], 1; +--:-:-:-:1 LEA.HI.X track0F1, tf0, param_F[1], RZ, 1; +--:-:-:-:1 LEA track4F0.CC, tf4, param_F[0], 1; +--:-:-:-:1 LEA.HI.X track4F1, tf4, param_F[1], RZ, 1; + +// trackE = k*MPQN + m*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD te, q, param_N, n; +--:-:-:-:1 XMAD.LO2C te, p, param_QN, te; +--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te; +--:-:-:-:1 XMAD.LO2C te0, k0, param_MPQN, te; +--:-:-:-:1 XMAD.LO2C te4, k4, param_MPQN, te; +--:-:-:-:1 LEA track0E0.CC, te0, param_E[0], 1; +--:-:-:-:1 LEA.HI.X track0E1, te0, param_E[1], RZ, 1; +--:-:-:-:1 LEA track4E0.CC, te4, param_E[0], 1; +--:-:-:-:1 LEA.HI.X track4E1, te4, param_E[1], RZ, 1; + +// P1 = crst < CRST +// P2 = n < N +// (no P3 = n+32 < N test is issued here; only P1 and P2 are set below) +--:-:-:-:1 ISETP.LT.AND P1, PT, crst, param_CRST, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, n, param_N, PT; + +// Remap the EX dim to avoid bank conflicts when storing to shared +// We can unmap this in the output + +// writeFs = (32*k + tidFX) * 4 +--:-:-:-:1 ISCADD writeFs, k0, tidFX, 5; +--:-:-:-:1 SHL writeFs, writeFs, 2; +// writeEs = (64*k + tidFX) * 4 (tidFX here not a bug) +--:-:-:-:1 ISCADD writeEs, k0, 
tidFX, 6; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x<32*8>, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readEs = ((tid >> 1) & 7) << 4 +--:-:-:-:1 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readEs, readEs, 4x<32*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, 4x<32*8 + 64*8>; + + +--:-:-:-:0 IADD K, K, -8; + +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F0, [track0F + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F1, [track0F + 2x<1>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F2, [track0F + 2x<2>]; +--:-:1:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F3, [track0F + 2x<3>]; + +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F0, [track4F + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F1, [track4F + 2x<1>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F2, [track4F + 2x<2>]; +--:-:2:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F3, [track4F + 2x<3>]; + +--:-:-:-:0 ISETP.GT.AND P1, PT, K, RZ, P1; + +--:-:3:-:1 @P2 LDG.E.128 load0E0, [track0E]; +--:-:4:-:1 @P2 LDG.E.128 load4E0, [track4E]; + +--:-:-:-:0 ISETP.GT.AND P2, PT, K, RZ, P2; + +01:-:-:-:1 [+ convert() +] store0F0, load0F0; +--:-:-:-:1 [+ convert() +] store0F1, load0F1; +--:-:-:-:1 [+ convert() +] store0F2, load0F2; +--:-:1:-:1 [+ convert() +] store0F3, load0F3; +--:-:-:-:6 IADD track0F0.CC, track0F0, param_CRST8; +--:-:-:-:0 IADD.X track0F1, track0F1, RZ; +01:-:-:-:1 STS.128 [writeFs + 4x<0*32>], store0F; + +02:-:-:-:1 [+ convert() +] store4F0, load4F0; +--:-:-:-:1 [+ convert() +] store4F1, load4F1; +--:-:-:-:1 [+ convert() +] store4F2, load4F2; +--:-:2:-:1 [+ convert() +] store4F3, load4F3; +--:-:-:-:6 IADD track4F0.CC, track4F0, param_CRST8; +--:-:-:-:0 IADD.X track4F1, track4F1, RZ; +02:-:-:-:1 STS.128 [writeFs + 4x<4*32>], store4F; + +04:-:-:-:1 [+ convert() +] store0E7, load0E3.H1; +--:-:-:-:1 [+ 
convert() +] store0E6, load0E3.H0; +--:-:-:-:1 [+ convert() +] store0E5, load0E2.H1; +--:-:1:-:1 [+ convert() +] store0E4, load0E2.H0; +--:-:-:-:1 [+ convert() +] store0E3, load0E1.H1; +--:-:-:-:1 [+ convert() +] store0E2, load0E1.H0; +--:-:-:-:1 [+ convert() +] store0E1, load0E0.H1; +--:-:2:-:1 [+ convert() +] store0E0, load0E0.H0; +--:-:-:-:6 IADD track0E0.CC, track0E0, param_MPQN8; +--:-:-:-:0 IADD.X track0E1, track0E1, RZ; +01:-:-:-:1 STS.128 [writeEs + 4x<0*64 + 32>], store0E4; +02:1:-:-:2 STS.128 [writeEs + 4x<0*64 + 0>], store0E0; + +09:-:-:-:1 [+ convert() +] store4E7, load4E3.H1; +--:-:-:-:1 [+ convert() +] store4E6, load4E3.H0; +--:-:-:-:1 [+ convert() +] store4E5, load4E2.H1; +--:-:1:-:1 [+ convert() +] store4E4, load4E2.H0; +--:-:-:-:1 [+ convert() +] store4E3, load4E1.H1; +--:-:-:-:1 [+ convert() +] store4E2, load4E1.H0; +--:-:-:-:1 [+ convert() +] store4E1, load4E0.H1; +--:-:2:-:1 [+ convert() +] store4E0, load4E0.H0; +--:-:-:-:6 IADD track4E0.CC, track4E0, param_MPQN8; +--:-:-:-:0 IADD.X track4E1, track4E1, RZ; +01:-:-:-:1 STS.128 [writeEs + 4x<4*64 + 32>], store4E4; +02:1:-:-:2 STS.128 [writeEs + 4x<4*64 + 0>], store4E0; + + +01:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD writeFs, writeFs, swapBuf; +--:-:-:-:2 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD K, K, -8; + +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>]; + +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F0, [track0F + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F1, [track0F + 2x<1>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F2, [track0F + 2x<2>]; +--:-:2:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F3, [track0F + 2x<3>]; + +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F0, [track4F + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F1, [track4F + 2x<1>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F2, 
[track4F + 2x<2>]; +--:-:3:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F3, [track4F + 2x<3>]; + +--:-:-:-:0 ISETP.GT.AND P1, PT, K, RZ, P1; + +--:-:4:-:1 @P2 LDG.E.128 load0E0, [track0E]; +--:-:5:-:1 @P2 LDG.E.128 load4E0, [track4E]; + +--:-:-:-:2 ISETP.GT.AND P2, PT, K, RZ, P2; + +NEXT_8K: +--:-:-:-:1 ISETP.GT.AND P0, PT, K, -8, PT; + +[+ + # Generates the body of the NEXT_8K loop: eight unrolled steps (j) of 64 FFMAs (c) each. + # %insert maps a key made of j, the step, c, and the FFMA index (for example j0c8) to the instruction + # text that is emitted immediately after that FFMA. + our $convert; + our $dtype; + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD K, K, -8;\n", + + j0c12 => "02:-:-:-:1 \@P0 $convert store0F0, load0F0;\n", + j0c16 => "--:-:-:-:1 \@P0 $convert store0F1, load0F1;\n", + j0c20 => "--:-:-:-:1 \@P0 $convert store0F2, load0F2;\n", + j0c24 => "--:-:2:-:1 \@P0 $convert store0F3, load0F3;\n", + j0c26 => "--:-:-:-:1 \@P0 IADD track0F0.CC, track0F0, param_CRST8;\n", + j0c31 => "--:-:-:-:1 \@P0 IADD.X track0F1, track0F1, RZ;\n", + j0c38 => "02:2:-:-:1 \@P0 STS.128 [writeFs + 4x<0*32>], store0F;\n", + j1c8 => "02:-:-:-:1 \@P1 LDG.E.CI.$dtype load0F0, [track0F + 2x<0>];\n", + j1c10 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype load0F1, [track0F + 2x<1>];\n", + j1c12 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype load0F2, [track0F + 2x<2>];\n", + j1c14 => "--:-:2:-:1 \@P1 LDG.E.CI.$dtype load0F3, [track0F + 2x<3>];\n", + + j2c12 => "04:-:-:-:1 \@P0 $convert store4F0, load4F0;\n", + j2c16 => "--:-:-:-:1 \@P0 $convert store4F1, load4F1;\n", + j2c20 => "--:-:-:-:1 \@P0 $convert store4F2, load4F2;\n", + j2c24 => "--:-:3:-:1 \@P0 $convert store4F3, load4F3;\n", + j2c26 => "--:-:-:-:1 \@P0 IADD track4F0.CC, track4F0, param_CRST8;\n", + j2c31 => "--:-:-:-:1 \@P0 IADD.X track4F1, track4F1, RZ;\n", + j2c38 => "04:3:-:-:1 \@P0 STS.128 [writeFs + 4x<4*32>], store4F;\n", + j3c8 => "04:-:-:-:1 \@P1 LDG.E.CI.$dtype load4F0, [track4F + 2x<0>];\n", + j3c10 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype load4F1, [track4F + 2x<1>];\n", + j3c12 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype load4F2, [track4F + 2x<2>];\n", + j3c14 => "--:-:3:-:1 \@P1 LDG.E.CI.$dtype load4F3, [track4F + 2x<3>];\n", + + j4c12 => "08:-:-:-:1 \@P0 $convert store0E7, load0E3.H1;\n", + 
j4c16 => "--:-:-:-:1 \@P0 $convert store0E6, load0E3.H0;\n", + j4c20 => "--:-:-:-:1 \@P0 $convert store0E5, load0E2.H1;\n", + j4c24 => "--:-:6:-:1 \@P0 $convert store0E4, load0E2.H0;\n", + j4c28 => "--:-:-:-:1 \@P0 $convert store0E3, load0E1.H1;\n", + j4c32 => "--:-:-:-:1 \@P0 $convert store0E2, load0E1.H0;\n", + j4c36 => "--:-:-:-:1 \@P0 $convert store0E1, load0E0.H1;\n", + j4c40 => "--:-:4:-:1 \@P0 $convert store0E0, load0E0.H0;\n", + j4c42 => "20:-:-:-:1 \@P0 STS.128 [writeEs + 4x<0*64 + 32>], store0E4;\n", + j4c44 => "--:-:-:-:1 \@P0 IADD track0E0.CC, track0E0, param_MPQN8;\n", + j4c49 => "--:-:-:-:1 \@P0 IADD.X track0E1, track0E1, RZ;\n", + j4c56 => "08:4:-:-:1 \@P0 STS.128 [writeEs + 4x<0*64 + 0>], store0E0;\n", + j5c8 => "08:-:4:-:1 \@P2 LDG.E.128 load0E0, [track0E];\n", + + j5c12 => "10:-:-:-:1 \@P0 $convert store4E7, load4E3.H1;\n", + j5c16 => "--:-:-:-:1 \@P0 $convert store4E6, load4E3.H0;\n", + j5c20 => "--:-:-:-:1 \@P0 $convert store4E5, load4E2.H1;\n", + j5c24 => "--:-:6:-:1 \@P0 $convert store4E4, load4E2.H0;\n", + j5c28 => "--:-:-:-:1 \@P0 $convert store4E3, load4E1.H1;\n", + j5c32 => "--:-:-:-:1 \@P0 $convert store4E2, load4E1.H0;\n", + j5c36 => "--:-:-:-:1 \@P0 $convert store4E1, load4E0.H1;\n", + j5c40 => "--:-:5:-:1 \@P0 $convert store4E0, load4E0.H0;\n", + j5c42 => "20:-:-:-:1 \@P0 STS.128 [writeEs + 4x<4*64 + 32>], store4E4;\n", + j5c44 => "--:-:-:-:1 \@P0 IADD track4E0.CC, track4E0, param_MPQN8;\n", + j5c49 => "--:-:-:-:1 \@P0 IADD.X track4E1, track4E1, RZ;\n", + j5c56 => "10:5:-:-:1 \@P0 STS.128 [writeEs + 4x<4*64 + 0>], store4E0;\n", + j6c8 => "10:-:5:-:1 \@P2 LDG.E.128 load4E0, [track4E];\n", + + j6c63 => "20:-:-:-:1 \@P0 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c8 => "--:-:-:-:1 ISETP.GT.AND P1, PT, K, RZ, P1;\n", + # NOTE(review): the P1 update above keeps its previous value as the last operand, and both P2 updates + # issued before NEXT_8K use P2 there, but the next entry uses PT, which drops the n < N test from P2 + # after the first pass. Confirm whether that is intended. + j7c10 => "--:-:-:-:1 ISETP.GT.AND P2, PT, K, RZ, PT;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_8K;\n", + ); + + # @cOrder lists the 64 accumulator coordinates [x, y] in FFMA issue order: for each x base (0,2,4,6) + # and each y base in @y the four @swirl offsets are applied, and @y is reversed after every x base. + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + # The FFMAs of step $j read register set $odd while the LDS.U.128 loads placed at c0, c2, c4 and c6 + # refill the other set ($nOdd) from row ($j + 1) % 8; on the last step those loads are predicated on P0. + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + my $barrier = $j == 6 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $barrier, $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + # Control fields of the FFMA: stall is 0 when the text inserted after it contains one of the listed + # opcodes and 1 otherwise; yield is Y only at c == 32 with stall 1; the wait mask 01 is set only at c == 0. + my $stall = $ins =~ /LDS|I2F|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +--:-:-:-:0 MOV warp_cnt, 32; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkF, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +01:-:-:-:6 MOV rst, tid; + +LUT_LOOP: + + +// warp synchronous loop while warp_cnt < RST (c=0) +--:-:-:-:1 ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT; +--:-:-:-:1 IADD warp_cnt, warp_cnt, 32; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = qs + s +// y = pr + r +// z = mt + t +--:-:-:-:1 IADD z, mt, t; +--:-:-:-:1 IADD y, pr, r; +--:-:-:-:1 IADD x, qs, s; +// i = (z*HWN + y*WN + x*N) * 2 +20:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD sliceI, x, param_N, sliceI; +--:-:-:-:1 SHL sliceI, sliceI, 1; +// Bounds check x, y and z, and make i negative if outside +--:-:-:-:1 ISET.LT.AND x0, x, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW, x, param_W, PT; +--:-:-:-:1 ISET.LT.AND y0, y, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH, y, param_H, PT; +--:-:-:-:1 ISET.LT.AND z0, z, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD, z, param_D, PT; +--:-:-:-:1 LOP3.LUT sliceI, sliceI, x0, xW, 0xfe; + +--:-:-:-:1 LOP3.LUT sliceI, sliceI, y0, yH, 0xfe; +--:-:-:-:1 SHL lutStore, rst, 2; +--:-:-:-:1 IADD rst, rst, 32; + +--:-:-:-:1 LOP3.LUT sliceI, sliceI, z0, zD, 0xfe; +// Store i imgOffset into the shared lookup table +--:6:-:-:1 STS [lutStore + addr_lut], sliceI; + + +--:-:-:-:5 @P0 BRA.U LUT_LOOP; + + + +--:-:-:-:1 MOV RST, param_RST; +--:-:-:-:1 MOV DHWN1, param_DHWN; +--:-:-:-:1 SHL DHWN1, 
DHWN1, 1; + +--:-:-:-:1 LOP.AND readEs, readEs, 0x7f; +--:-:-:-:1 LOP.AND readFs, readFs, 0x3f; + +// Expand back out to undo our bank conflict avoiding stride +--:-:-:-:1 SHL readEs, readEs, 1; + +// writeCs = ((readFs / 4) * 64 + readEs) / 2; +--:-:-:-:1 ISCADD writeCs, readFs, readEs, 4; +--:-:-:-:1 SHR.U32 writeCs, writeCs, 1; + +// readCs = (tid & 31) << 2; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL readCs, tid31, 2; + +// nn = blkE*64 + tid31 << 1; +--:-:-:-:1 SHL tid31, tid31, 1; +04:-:-:-:1 ISCADD nn, blkE, tid31, 6; + +// crst = blkF*32 +02:-:-:-:1 SHL crst00, blkF, 5; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + +--:-:-:-:1 LEA trackI0.CC, nn, param_I[0], 1; +--:-:-:-:1 LEA.HI.X trackI1, nn, param_I[1], RZ, 1; + +// n < N +--:-:-:-:1 ISETP.LT.AND P5, PT, nn, param_N, PT; + +--:-:-:-:1 MOV alpha, param_alpha; + + + + + + # Emits eight store passes, one per accumulator row y: the eight cx values of row y are scaled by + # alpha into c0..c7 and STORE_C is called. Before row 4 the four crst counters are each advanced by 12. + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:1 IADD crst00, crst00, 12;\n" . + "--:-:-:-:1 IADD crst04, crst04, 12;\n" . + "--:-:-:-:1 IADD crst08, crst08, 12;\n" . + "--:-:-:-:1 IADD crst12, crst12, 12;\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +// Round nearest +--:-:-:-:1 F2F.F16.F32 c0, c0; +--:-:1:-:1 F2F.F16.F32 c1, c1; +--:-:-:-:1 F2F.F16.F32 c2, c2; +--:-:2:-:1 F2F.F16.F32 c3, c3; +--:-:-:-:1 F2F.F16.F32 c4, c4; +--:-:3:-:1 F2F.F16.F32 c5, c5; +--:-:-:-:1 F2F.F16.F32 c6, c6; +--:-:4:-:1 F2F.F16.F32 c7, c7; + +// Pack 2 16 bit values into 32 bit words +11:-:-:-:2 BFI cs0, c1, 0x1010, c0; +02:-:-:-:2 BFI cs1, c3, 0x1010, c2; +24:-:-:-:2 BFI cs2, c5, 0x1010, c4; +08:-:-:-:0 BFI cs3, c7, 0x1010, c6; + +// Undo the stride in the X dim (items spaced by 32 are actually spaced 4) +--:-:-:-:4 STS.64 [writeCs+2x<0>], cs0; +--:-:-:-:1 STS.64 [writeCs+2x<4>], cs2; +--:-:-:-:1 LDS cs0, [readCs + 2x<0*64>]; +--:-:-:-:1 LDS cs1, [readCs + 2x<1*64>]; +--:-:-:-:1 LDS cs2, [readCs + 2x<2*64>]; +--:-:-:-:1 LDS cs3, [readCs + 2x<3*64>]; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; + +--:-:-:-:1 XMAD.LO2C c00, crst00, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c04, crst04, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c08, crst08, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c12, crst12, param_magic_RST, RZ; + +--:-:-:-:1 SHR.U32 c00, c00, param_shift_RST; +--:-:-:-:1 SHR.U32 c04, c04, param_shift_RST; +--:-:-:-:1 SHR.U32 c08, c08, param_shift_RST; +--:-:-:-:1 SHR.U32 c12, c12, param_shift_RST; + +--:-:-:-:1 VMAD.U16.U16 lut00, -c00, RST, crst00; +--:-:-:-:1 VMAD.U16.U16 lut04, -c04, RST, crst04; +--:-:-:-:1 VMAD.U16.U16 lut08, -c08, RST, crst08; +--:-:-:-:1 VMAD.U16.U16 lut12, -c12, RST, crst12; + +--:-:-:-:1 SHL lut00, lut00, 2; +--:-:-:-:1 SHL lut04, lut04, 2; +--:-:-:-:1 SHL lut08, lut08, 2; +--:-:-:-:1 SHL lut12, lut12, 2; + +--:-:-:-:1 XMAD.LO2 chan00, DHWN1, c00, RZ; +--:-:-:-:1 
XMAD.LO2 chan04, DHWN1, c04, RZ; +--:-:-:-:1 XMAD.LO2 chan08, DHWN1, c08, RZ; +--:-:-:-:1 XMAD.LO2 chan12, DHWN1, c12, RZ; + +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 IADD crst12, crst12, 1; + +--:-:1:-:1 @P0 LDS img00, [lut00 + addr_lut]; +--:-:2:-:1 @P1 LDS img04, [lut04 + addr_lut]; +--:-:3:-:1 @P2 LDS img08, [lut08 + addr_lut]; +--:-:4:-:1 @P3 LDS img12, [lut12 + addr_lut]; + + + +01:-:-:-:1 IADD3 track00I0.CC, trackI0, img00, chan00; +--:-:-:-:5 ISETP.GE.AND P0, PT, img00, RZ, P0; +--:-:-:-:1 IADD.X track00I1, trackI1, RZ; + +02:-:-:-:1 IADD3 track04I0.CC, trackI0, img04, chan04; +--:-:-:-:5 ISETP.GE.AND P1, PT, img04, RZ, P1; +--:-:-:-:1 IADD.X track04I1, trackI1, RZ; + +04:-:-:-:1 IADD3 track08I0.CC, trackI0, img08, chan08; +--:-:-:-:5 ISETP.GE.AND P2, PT, img08, RZ, P2; +--:-:-:-:1 IADD.X track08I1, trackI1, RZ; + +08:-:-:-:1 IADD3 track12I0.CC, trackI0, img12, chan12; +--:-:-:-:5 ISETP.GE.AND P3, PT, img12, RZ, P3; +--:-:-:-:0 IADD.X track12I1, trackI1, RZ; + +--:-:-:-:2 @P0 RED.E.ADD.F16x2.FTZ.RN [track00I], cs0; +--:5:-:-:2 @P1 RED.E.ADD.F16x2.FTZ.RN [track04I], cs1; +--:-:-:-:4 @P2 RED.E.ADD.F16x2.FTZ.RN [track08I], cs2; +--:6:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [track12I], cs3; + +--:-:-:-:5 RET; + diff --git a/Kernel/Convolution/Maxwell/hconv_updat_C128_K128.sass b/Kernel/Convolution/Maxwell/hconv_updat_C128_K128.sass new file mode 100644 index 0000000..d6c9c15 --- /dev/null +++ b/Kernel/Convolution/Maxwell/hconv_updat_C128_K128.sass @@ -0,0 +1,775 @@ +# Kernel: hconv_updat_C128_K128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + addr_zero : 4x<(128*16 + 32)*4> + addr_blkIE : 4x<(128*16 + 32)*4 + 4> + addr_q : 4x<(128*16 + 32)*4 + 6> + szBuf : (128*16 + 32) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_offset_K : c[0x0][0x15c] + param_N : c[0x0][0x160] + param_K : c[0x0][0x164] + param_D : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_WN : c[0x0][0x174] + param_HWN : c[0x0][0x178] + param_DHWN : c[0x0][0x17c] + param_C : c[0x0][0x180] + param_CRST : c[0x0][0x184] + param_RST : c[0x0][0x188] + param_magic_RST : c[0x0][0x18c] + param_shift_RST : c[0x0][0x190] + param_RS : c[0x0][0x194] + param_magic_RS : c[0x0][0x198] + param_shift_RS : c[0x0][0x19c] + param_S : c[0x0][0x1a0] + param_magic_S : c[0x0][0x1a4] + param_shift_S : c[0x0][0x1a8] + param_pad_d : c[0x0][0x1ac] + param_pad_h : c[0x0][0x1b0] + param_pad_w : c[0x0][0x1b4] + param_str_d : c[0x0][0x1b8] + param_str_h : c[0x0][0x1bc] + param_str_w : c[0x0][0x1c0] + param_dil_d : c[0x0][0x1c4] + param_dil_h : c[0x0][0x1c8] + param_dil_w : c[0x0][0x1cc] + param_P : c[0x0][0x1d0] + param_Q : c[0x0][0x1d4] + param_PQ : c[0x0][0x1d8] + param_QN : c[0x0][0x1dc] + param_PQN : c[0x0][0x1e0] + param_MPQN : c[0x0][0x1e4] + param_magic_Q : c[0x0][0x1e8] + param_shift_Q : c[0x0][0x1ec] + param_magic_PQ : c[0x0][0x1f0] + 
param_shift_PQ : c[0x0][0x1f4] + param_grid_P : c[0x0][0x1f8] + param_grid_Q : c[0x0][0x1fc] + param_grid_PQ : c[0x0][0x200] + param_CRSTK : c[0x0][0x204] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-65 : one + 64-65 : blkIE<0-1> + 64-68 : blkI, blkE, tid, tidX, tidY + 69-95 ~ blkMPQ, tid1, tid7, tid128, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 69-95 ~ c, z, y, x, k, te, mt, pr, qs, r, s, t, rs, rst, crst, ti, xw, xW, yh, yH, zd, zD, cC, nextP, nextQ, Q + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 96-99 : loadI<0-3> + 96-99 : storeI<0-3> + 100-103 : loadI<4-7> + 112-115 : storeI<4-7> + + 104-107 : loadE<0-3> + 104-107 : storeE<0-3> + 108-111 : loadE<4-7> + 112-115 : storeE<4-7> + + 116-119 : trackI<0-1>, trackE<0-1> + + 120-124 ~ writeS, loopN, m, p, q + 125-127 ~ readIs, readEs, swapBuf + + 72-87 : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 88-124 ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128, blk_MPQ, CRSTK, xmad_determ + + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID1; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkI, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; +--:-:-:-:5 BRA.U END_CTAID1; +CTAID1: +--:-:2:-:1 S2R blkMPQ, SR_CTAID.Z; +--:-:3:-:1 S2R blkI, SR_CTAID.X; +--:-:4:-:1 S2R blkE, SR_CTAID.Y; +END_CTAID1: + + +// tidX = tid >> 1 +// tidY = (tid & 1) << 3 +// shiftX = (tid & 1) << 4 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tid1, 3; +--:-:-:-:1 SHL shiftX, tid1, 4; + +0c:-:-:-:1 STS.64 [addr_blkIE], 
blkIE; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; ++] + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_grid_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_grid_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; + +--:-:-:-:1 STS [addr_q], q; + +// writeS = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 IADD writeS, writeS, shiftX; +--:-:-:-:1 ISCADD writeS, writeS, 4x, 2; + +// readIs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readIs, tid, 0x70; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readEs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + szBuf; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readEs, tid128, 4; +--:-:-:-:1 LOP.OR readEs, readEs, tid7; +--:-:-:-:1 ISCADD readEs, readEs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, 
-4x; + +--:-:-:-:1 MOV loopN, RZ; + +// Flag for first load branch +--:-:-:-:1 PSETP.AND.AND P0, PT, PT, PT, PT; + + + +NEXT_PQ: + +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 LDS.U.64 blkIE, [addr_blkIE]; + + +// Zigzag q but only if grid_P < P +--:-:-:-:1 LOP.AND.NZ P1, RZ, p, 1; +--:-:-:-:1 MOV Q, param_grid_P; +--:-:-:-:1 ISETP.LT.AND P1, PT, Q, param_P, P1; +--:-:-:-:1 MOV Q, -1; +--:-:-:-:1 @P1 IADD3 Q, -q, param_Q, Q; +--:-:-:-:1 @!P1 MOV Q, q; +// tidX = tid >> 1 +// tidY = (tid & 1) << 3 +--:-:-:-:1 LOP.AND tidY, tid, 1; +02:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tidY, 3; +// crst = blockI*128 + tid +04:-:-:-:1 ISCADD crst, blkI, tidX, 7; +// k = blockE*128 + tid +04:-:-:-:1 ISCADD k, blkE, tidX, 7; +--:-:-:-:1 IADD k, k, param_offset_K; + +// c = crst / RST +// rst = crst % RST +--:-:-:-:1 XMAD.LO2C c, crst, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c, c, param_shift_RST; +--:-:-:-:1 XMAD rst, c, param_RST, RZ; +--:-:-:-:1 IADD rst, -rst, crst; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = q * v - pad_w + (s * dil_w) +// y = p * u - pad_h + (r * dil_h) +// z = m * w - pad_d + (t * dil_d) +--:-:-:-:1 XMAD qs, Q, param_str_w, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 IADD x, x, -param_pad_w; +--:-:-:-:1 IADD y, y, -param_pad_h; +--:-:-:-:1 IADD z, z, -param_pad_d; +// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY +--:-:-:-:1 XMAD ti, x, param_N, tidY; +--:-:-:-:1 XMAD.LO2C ti, y, param_WN, ti; +--:-:-:-:1 XMAD.LO2C ti, z, param_HWN, ti; 
+--:-:-:-:1 XMAD.LO2C ti, c, param_DHWN, ti; +--:-:-:-:1 LEA trackI0.CC, ti, param_I[0], 1; +--:-:-:-:1 LEA.HI.X trackI1, ti, param_I[1], RZ, 1; +// trackE = k*MPQN + m*PQN + p*QN + tidY +--:-:-:-:1 XMAD te, Q, param_N, tidY; +--:-:-:-:1 XMAD.LO2C te, p, param_QN, te; +--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te; +--:-:-:-:1 XMAD.LO2C te, k, param_MPQN, te; +--:-:-:-:1 LEA trackE0.CC, te, param_E[0], 1; +--:-:-:-:1 LEA.HI.X trackE1, te, param_E[1], RZ, 1; +// Bounds check x,y,z,c for each I track. +// If out of bounds, this will set the track address to -1 +--:-:-:-:1 ISET.GE.AND cC, c, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd, z, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD, z, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh, y, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH, y, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw, x, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW, x, param_W, PT; +--:-:-:-:1 LOP.OR trackI0, trackI0, cC; +--:-:-:-:1 LOP3.LUT trackI0, trackI0, zd, zD, 0xfe; +--:-:-:-:1 LOP3.LUT trackI0, trackI0, yh, yH, 0xfe; +--:-:-:-:1 LOP3.LUT trackI0, trackI0, xw, xW, 0xfe; + +01:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:0 ISETP.NE.AND P2, PT, trackI0, -1, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, k, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, nextP, param_P, PT; +--:-:-:-:1 IADD loopN, loopN, param_N; + + +--:-:-:Y:5 @P0 BRA.U FIRST_LOAD; + +INIT_LOOP: + +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*128 + 64>]; +--:-:1:-:2 LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + +NEXT_16N: + +[+ + + our $convert; + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD loopN, loopN, -16;\n", + + # p0 = (N & 16) == 0 + # p1 = N >= 32 && p0 + j0c14 => "--:-:-:-:1 LOP.AND.NZ P0, RZ, loopN, 16;\n", + j0c28 => "--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 32, P0;\n", + + j2c34 => "--:-:-:-:1 \@P0 $convert storeI7, loadI7.H1;\n", 
+ j2c38 => "--:-:-:-:1 \@P0 $convert storeI6, loadI7.H0;\n", + j2c42 => "--:-:-:-:1 \@P0 $convert storeI5, loadI6.H1;\n", + j2c46 => "--:-:-:-:1 \@P0 $convert storeI4, loadI6.H0;\n", + j2c50 => "--:-:-:-:1 \@P0 $convert storeI3, loadI5.H1;\n", + j2c54 => "--:-:-:-:1 \@P0 $convert storeI2, loadI5.H0;\n", + j2c58 => "--:-:-:-:1 \@P0 $convert storeI1, loadI4.H1;\n", + j2c62 => "--:-:-:-:1 \@P0 $convert storeI0, loadI4.H0;\n", + + j3c34 => "02:-:-:-:1 \@!P0 $convert storeI7, loadI3.H1;\n", + j3c38 => "--:-:-:-:1 \@!P0 $convert storeI6, loadI3.H0;\n", + j3c42 => "--:-:-:-:1 \@!P0 $convert storeI5, loadI2.H1;\n", + j3c46 => "--:-:5:-:1 \@!P0 $convert storeI4, loadI2.H0;\n", + j3c50 => "--:-:-:-:1 \@!P0 $convert storeI3, loadI1.H1;\n", + j3c54 => "--:-:-:-:1 \@!P0 $convert storeI2, loadI1.H0;\n", + j3c58 => "--:-:-:-:1 \@!P0 $convert storeI1, loadI0.H1;\n", + j3c62 => "--:-:2:-:1 \@!P0 $convert storeI0, loadI0.H0;\n", + + j4c8 => "10:-:-:-:1 STS [writeS + 4x<7*128>], storeI7;\n", + j4c10 => "--:-:-:-:1 STS [writeS + 4x<6*128>], storeI6;\n", + j4c12 => "--:-:-:-:1 STS [writeS + 4x<5*128>], storeI5;\n", + j4c14 => "--:-:-:-:1 STS [writeS + 4x<4*128>], storeI4;\n", + j4c16 => "02:-:-:-:1 STS [writeS + 4x<3*128>], storeI3;\n", + j4c18 => "--:-:-:-:1 STS [writeS + 4x<2*128>], storeI2;\n", + j4c20 => "--:-:-:-:1 STS [writeS + 4x<1*128>], storeI1;\n", + j4c22 => "--:2:-:-:1 STS [writeS + 4x<0*128>], storeI0;\n", + + j4c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, trackI0, -1, P1;\n", + j4c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, trackI0, -1, P1;\n", + + j5c8 => "02:-:-:-:1 \@P2 LDG.E.CI.128 loadI0, [trackI + 2x< 0>];\n", + j5c10 => "--:5:2:-:1 \@P2 LDG.E.CI.128 loadI4, [trackI + 2x<16>];\n", + + j6c8 => "--:-:-:-:1 \@P3 LDS.U.128 loadI0, [addr_zero];\n", + j7c8 => "--:-:-:-:1 \@P3 LDS.U.128 loadI4, [addr_zero];\n", + + j7c57 => "10:-:-:-:1 \@P2 IADD trackI0.CC, trackI0, 2x<32>;\n", + j7c63 => "--:-:-:-:1 \@P2 IADD.X trackI1, trackI1, RZ;\n", + + + j10c34 => "--:-:-:-:1 \@P0 $convert 
storeE7, loadE7.H1;\n", + j10c38 => "--:-:-:-:1 \@P0 $convert storeE6, loadE7.H0;\n", + j10c42 => "--:-:-:-:1 \@P0 $convert storeE5, loadE6.H1;\n", + j10c46 => "--:-:-:-:1 \@P0 $convert storeE4, loadE6.H0;\n", + j10c50 => "--:-:-:-:1 \@P0 $convert storeE3, loadE5.H1;\n", + j10c54 => "--:-:-:-:1 \@P0 $convert storeE2, loadE5.H0;\n", + j10c58 => "--:-:-:-:1 \@P0 $convert storeE1, loadE4.H1;\n", + j10c62 => "--:-:-:-:1 \@P0 $convert storeE0, loadE4.H0;\n", + + j11c34 => "04:-:-:-:1 \@!P0 $convert storeE7, loadE3.H1;\n", + j11c38 => "--:-:-:-:1 \@!P0 $convert storeE6, loadE3.H0;\n", + j11c42 => "--:-:-:-:1 \@!P0 $convert storeE5, loadE2.H1;\n", + j11c46 => "--:-:5:-:1 \@!P0 $convert storeE4, loadE2.H0;\n", + j11c50 => "--:-:-:-:1 \@!P0 $convert storeE3, loadE1.H1;\n", + j11c54 => "--:-:-:-:1 \@!P0 $convert storeE2, loadE1.H0;\n", + j11c58 => "--:-:-:-:1 \@!P0 $convert storeE1, loadE0.H1;\n", + j11c62 => "--:-:3:-:1 \@!P0 $convert storeE0, loadE0.H0;\n", + + j12c8 => "10:-:-:-:1 STS [writeS + 4x<7*128 + szBuf>], storeE7;\n", + j12c10 => "--:-:-:-:1 STS [writeS + 4x<6*128 + szBuf>], storeE6;\n", + j12c12 => "--:-:-:-:1 STS [writeS + 4x<5*128 + szBuf>], storeE5;\n", + j12c14 => "--:-:-:-:1 STS [writeS + 4x<4*128 + szBuf>], storeE4;\n", + j12c16 => "04:-:-:-:1 STS [writeS + 4x<3*128 + szBuf>], storeE3;\n", + j12c18 => "--:-:-:-:1 STS [writeS + 4x<2*128 + szBuf>], storeE2;\n", + j12c20 => "--:-:-:-:1 STS [writeS + 4x<1*128 + szBuf>], storeE1;\n", + j12c22 => "--:3:-:-:1 STS [writeS + 4x<0*128 + szBuf>], storeE0;\n", + + j12c24 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P1, P4, PT;\n", + + j13c8 => "04:-:-:-:1 \@P2 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];\n", + j13c10 => "--:5:3:-:1 \@P2 LDG.E.CI.128 loadE4, [trackE + 2x<16>];\n", + + j15c57 => "10:-:-:-:1 \@P2 IADD trackE0.CC, trackE0, 2x<32>;\n", + j15c62 => "--:-:-:-:1 \@P2 IADD.X trackE1, trackE1, RZ;\n", + + # p0 = N >= 16 and not (N == 32 and (p or q)) + j14c8 => "--:-:-:-:1 ISETP.EQ.AND P0, PT, loopN, 32, PT;\n", + j14c10 => 
"--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 16, PT;\n", + j14c22 => "--:-:-:-:1 PSETP.OR.AND P0, PT, P5, P6, P0;\n", + j14c35 => "--:-:-:-:1 PSETP.AND.AND P0, PT, !P0, P1, PT;\n", + + j14c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "20:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j15c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_16N;\n" . + "--:-:-:-:0 \@P5 IADD q, q, param_grid_Q;\n" . + "01:-:-:Y:5 \@P5 BRA.U NEXT_PQ;\n" . + "--:-:1:-:1 \@P6 LDS q, [addr_q];\n" . + "--:-:-:-:0 \@P6 IADD p, p, param_grid_P;\n" . + "--:-:-:Y:5 \@P6 BRA.U NEXT_PQ;\n" . + "--:-:-:Y:5 BRA.U FINISH;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? '@P0' : ' '; + my $shift = $rsOffset < 8 ? 0 : 1; + my $barrier = $j == 14 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + ++] + +FIRST_LOAD: + +--:-:-:-:8 PSETP.AND.AND P0, PT, PT, PT, !PT; + +// p1 = N == 32 and (p or q) +--:-:-:-:0 ISETP.EQ.AND P1, PT, loopN, 32, PT; + +--:-:-:-:1 @P2 LDG.E.CI.128 loadI0, [trackI + 2x< 0>]; +--:-:1:-:1 @P2 LDG.E.CI.128 loadI4, [trackI + 2x<16>]; +--:-:-:-:1 @!P2 LDS.U.128 loadI0, [addr_zero]; +--:-:5:-:1 @!P2 LDS.U.128 loadI4, [addr_zero]; + +--:-:-:-:1 @P4 LDG.E.CI.128 loadE0, [trackE + 2x< 0>]; +--:-:2:-:1 @P4 LDG.E.CI.128 loadE4, [trackE + 2x<16>]; +--:-:-:-:1 @!P4 LDS.U.128 loadE0, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 loadE4, [addr_zero]; + +11:-:-:-:1 [+ convert() +] storeI7, loadI3.H1; +--:-:-:-:1 [+ convert() +] storeI6, loadI3.H0; +--:-:-:-:1 [+ convert() +] storeI5, loadI2.H1; +--:-:1:-:1 [+ convert() +] storeI4, loadI2.H0; +--:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:5:-:1 [+ convert() +] storeI0, loadI0.H0; + +--:-:-:-:1 PSETP.OR.AND P1, PT, P5, P6, P1; + +--:-:-:-:0 @P2 IADD trackI0.CC, trackI0, 2x<32>; + +01:-:-:-:1 STS [writeS + 4x<7*128>], storeI7; +--:-:-:-:1 STS [writeS + 4x<6*128>], storeI6; +--:-:-:-:1 STS [writeS + 4x<5*128>], storeI5; +--:-:-:-:1 STS [writeS + 4x<4*128>], storeI4; +10:-:-:-:1 STS [writeS + 4x<3*128>], storeI3; +--:-:-:-:1 STS [writeS + 4x<2*128>], storeI2; +--:-:-:-:1 STS [writeS + 4x<1*128>], storeI1; +--:1:-:-:2 STS [writeS + 4x<0*128>], storeI0; + +--:-:-:-:0 @P2 IADD.X trackI1, trackI1, RZ; + +23:-:-:-:1 [+ convert() +] storeE7, loadE3.H1; +--:-:-:-:1 [+ convert() +] storeE6, loadE3.H0; +--:-:-:-:1 [+ convert() +] storeE5, loadE2.H1; +--:-:2:-:1 [+ convert() +] storeE4, loadE2.H0; +--:-:-:-:1 [+ convert() +] storeE3, loadE1.H1; +--:-:-:-:1 [+ convert() +] storeE2, loadE1.H0; +--:-:-:-:1 [+ convert() +] 
storeE1, loadE0.H1; +--:-:6:-:1 [+ convert() +] storeE0, loadE0.H0; + +--:-:-:-:2 PSETP.AND.AND P5, PT, P5, P1, PT; +--:-:-:-:1 PSETP.AND.AND P6, PT, P6, P1, PT; +--:-:-:-:0 @P4 IADD trackE0.CC, trackE0, 2x<32>; + +02:-:-:-:1 STS [writeS + 4x<7*128 + szBuf>], storeE7; +--:-:-:-:1 STS [writeS + 4x<6*128 + szBuf>], storeE6; +--:-:-:-:1 STS [writeS + 4x<5*128 + szBuf>], storeE5; +--:-:-:-:1 STS [writeS + 4x<4*128 + szBuf>], storeE4; +20:-:-:-:1 STS [writeS + 4x<3*128 + szBuf>], storeE3; +--:-:-:-:1 STS [writeS + 4x<2*128 + szBuf>], storeE2; +--:-:-:-:1 STS [writeS + 4x<1*128 + szBuf>], storeE1; +--:1:-:-:1 STS [writeS + 4x<0*128 + szBuf>], storeE0; + +--:-:-:-:1 @P4 IADD.X trackE1, trackE1, RZ; + +--:-:-:-:1 IADD readEs, readEs, -swapBuf; +--:-:-:-:0 IADD readIs, readIs, -swapBuf; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:0 @P5 IADD q, q, param_grid_Q; +--:-:-:Y:5 @P5 BRA.U NEXT_PQ; +--:-:-:-:0 @P6 IADD p, p, param_grid_P; +--:-:-:Y:5 @P6 BRA.U NEXT_PQ; + +--:-:-:-:2 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:Y:5 BRA.U INIT_LOOP; + + +FINISH: + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID2; +--:-:2:-:1 S2R blkI, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.X; +--:-:-:-:5 BRA.U END_CTAID2; +CTAID2: +--:-:2:-:1 S2R blkI, SR_CTAID.X; +--:-:3:-:1 S2R blkE, SR_CTAID.Y; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.Z; +END_CTAID2: + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readEs, readEs, -4x; +--:-:-:-:1 @P0 IADD readIs, readIs, -swapBuf; +--:-:-:-:1 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 128 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 5; + +--:-:-:-:1 LOP.AND tid31, tid, 31; 
+--:-:-:-:1 LOP.AND tid96, tid, 96; +01:-:-:-:1 LOP.AND t128, tid, 128; + +// kk = tid31 | (t128 >> 2); +--:-:-:-:1 SHR.U32 kk, t128, 2; +--:-:-:-:1 LOP.OR kk, tid31, kk; + +// readCs = ((tid96 << 4) | kk) << 2; +--:-:-:-:1 SHL readCs, tid96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, kk; +--:-:-:-:1 SHL readCs, readCs, 2; + +// kk += blkE*128; +04:-:-:-:1 ISCADD kk, blkE, kk, 7; +--:-:-:-:1 IADD kk, kk, param_offset_K; + +// crst = blkI*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 crst00, tid96, 1; +02:-:-:-:1 ISCADD crst00, blkI, crst00, 7; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; +--:-:-:-:1 SHL K4, K, 4; +--:-:-:-:1 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +--:-:-:-:1 VMAD.U16.U16 tf, crst00, K, kk; +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSTK, param_CRSTK; +08:-:-:-:1 XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ; + }; + } + return ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 0x2; +--:-:-:-:1 LEA.HI.X track00F1, tf, param_F[1], RZ, 0x2; + +// kk < K +--:-:-:-:1 ISETP.LT.AND P5, PT, kk, param_K, PT; +--:-:-:-:1 IADD kk, kk, 64; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, PT; + +--:-:-:-:1 MOV alpha, param_alpha; + + + +--:-:-:-:6 IADD track04F0.CC, track00F0, K4; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:6 IADD track08F0.CC, track04F0, K4; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:6 IADD track12F0.CC, track08F0, K4; +--:-:-:-:0 IADD.X track12F1, track08F1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + +[+ + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD track00F0.CC, track00F0, K60;\n" . + "--:-:-:-:1 IADD crst00, crst00, 60;\n" . + "--:-:-:-:1 IADD.X track00F1, track00F1, RZ;\n" . + "--:-:-:-:5 IADD track04F0.CC, track04F0, K60;\n" . + "--:-:-:-:1 IADD crst04, crst04, 60;\n" . + "--:-:-:-:1 IADD.X track04F1, track04F1, RZ;\n" . 
+ "--:-:-:-:5 IADD track08F0.CC, track08F0, K60;\n" . + "--:-:-:-:1 IADD crst08, crst08, 60;\n" . + "--:-:-:-:1 IADD.X track08F1, track08F1, RZ;\n" . + "--:-:-:-:5 IADD track12F0.CC, track12F0, K60;\n" . + "--:-:-:-:1 IADD crst12, crst12, 60;\n" . + "--:-:-:-:1 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL f0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL f1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL f2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL f3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL f4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL f5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL f6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL f7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + ++] + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +--:-:-:-:0 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], f0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], f4; + +--:-:1:-:1 LDS f0, [readCs + 4x<0*128 + 00>]; +--:-:2:-:1 LDS f2, [readCs + 4x<1*128 + 00>]; +--:-:3:-:1 LDS f4, [readCs + 4x<2*128 + 00>]; +--:-:4:-:a LDS f6, [readCs + 4x<3*128 + 00>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:-:-:-:1 @P0 STG.E.CG [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 STG.E.CG [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 STG.E.CG [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 STG.E.CG [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, 
PT; + }; + } + else + { + return q{ +01:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } ++] + +--:-:1:-:1 LDS f1, [readCs + 4x<0*128 + 64>]; +--:-:2:-:1 LDS f3, [readCs + 4x<1*128 + 64>]; +--:-:3:-:1 LDS f5, [readCs + 4x<2*128 + 64>]; +--:-:4:-:a LDS f7, [readCs + 4x<3*128 + 64>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:1:-:-:1 @P0 STG.E.CG [track00F + 4x<64>], f1; +02:2:-:-:1 @P1 STG.E.CG [track04F + 4x<64>], f3; +04:3:-:-:1 @P2 STG.E.CG [track08F + 4x<64>], f5; +08:4:-:-:1 @P3 STG.E.CG [track12F + 4x<64>], f7; + }; + } + else + { + return q{ +01:1:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], f1; +02:2:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], f3; +04:3:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], f5; +08:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], f7; + }; + } ++] + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Maxwell/hconv_updat_C128_K64.sass b/Kernel/Convolution/Maxwell/hconv_updat_C128_K64.sass new file mode 100644 index 0000000..a40fcb8 --- /dev/null +++ b/Kernel/Convolution/Maxwell/hconv_updat_C128_K64.sass @@ -0,0 +1,860 @@ +# Kernel: hconv_updat_C128_K64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2> + szShareI : (128*16 + 32) + szShareE : (64*16 + 32) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_offset_K : c[0x0][0x15c] + param_N : c[0x0][0x160] + param_K : c[0x0][0x164] + param_D : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_WN : c[0x0][0x174] + param_HWN : c[0x0][0x178] + param_DHWN : c[0x0][0x17c] + param_C : c[0x0][0x180] + param_CRST : c[0x0][0x184] + param_RST : c[0x0][0x188] + param_magic_RST : c[0x0][0x18c] + param_shift_RST : c[0x0][0x190] + param_RS : c[0x0][0x194] + param_magic_RS : c[0x0][0x198] + param_shift_RS : c[0x0][0x19c] + param_S : c[0x0][0x1a0] + param_magic_S : c[0x0][0x1a4] + param_shift_S : c[0x0][0x1a8] + param_pad_d : c[0x0][0x1ac] + param_pad_h : c[0x0][0x1b0] + param_pad_w : c[0x0][0x1b4] + param_str_d : c[0x0][0x1b8] + param_str_h : c[0x0][0x1bc] + param_str_w : c[0x0][0x1c0] + param_dil_d : c[0x0][0x1c4] + param_dil_h : c[0x0][0x1c8] + param_dil_w : c[0x0][0x1cc] + param_P : c[0x0][0x1d0] + param_Q : c[0x0][0x1d4] + param_PQ : c[0x0][0x1d8] + param_QN : c[0x0][0x1dc] + param_PQN : c[0x0][0x1e0] + param_MPQN : c[0x0][0x1e4] + 
param_magic_Q : c[0x0][0x1e8] + param_shift_Q : c[0x0][0x1ec] + param_magic_PQ : c[0x0][0x1f0] + param_shift_PQ : c[0x0][0x1f4] + param_grid_P : c[0x0][0x1f8] + param_grid_Q : c[0x0][0x1fc] + param_grid_PQ : c[0x0][0x200] + param_CRSTK : c[0x0][0x204] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkI, blkE, one + 68-99 ~ blkMPQ, tidX, tid1, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-72 ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q + 73-99 ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1> + 73-99 ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 100-131 : load0I<0-7>, load1I<0-7>, loadE<0-7>, storeX<0-7> + 132-137 : track0I<0-1>, track1I<0-1>, trackE<0-1> + + 138-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY + 165-167 ~ readIs, readEs, swapBuf + + 68-83 : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID1; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkI, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; +--:-:-:-:5 BRA.U END_CTAID1; +CTAID1: +--:-:2:-:1 S2R blkMPQ, SR_CTAID.Z; +--:-:3:-:1 S2R blkI, SR_CTAID.X; +--:-:4:-:1 S2R blkE, SR_CTAID.Y; +END_CTAID1: + + +// tidX = tid >> 1 +// tidY = (tid & 1) << 3 +// shiftX = (tid & 1) << 4 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tid1, 3; +--:-:-:-:1 SHL 
shiftX, tid1, 4; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_grid_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_grid_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; +--:-:-:-:1 MOV qq, q; + +// writeIs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 7; +--:-:-:-:1 IADD writeIs, writeIs, shiftX; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +// writeEs = (tidY*64 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeEs, tidY, tidX, 6; +--:-:-:-:1 IADD writeEs, writeEs, shiftX; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x, 2; + +// readIs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readIs, tid, -16; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; +--:-:-:-:1 SHL readIs, readIs, 4; +// readEs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +--:-:-:-:1 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readEs, 
readEs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + +// crst = blockI*128 + tid +04:-:-:-:1 ISCADD crst0, blkI, tidX, 7; +--:-:-:-:1 IADD crst1, crst0, 64; + +// k = blockE*64 + tid +08:-:-:-:1 ISCADD k, blkE, tidX, 6; +--:-:-:-:1 IADD k, k, param_offset_K; + +--:-:-:-:1 MOV loopN, RZ; + +// Flag for first load branch +--:-:-:-:1 PSETP.AND.AND P0, PT, PT, PT, PT; + + +NEXT_PQ: + + +// Zigzag q but only if grid_P < P +--:-:-:-:1 LOP.AND.NZ P1, RZ, p, 1; +--:-:-:-:1 MOV Q, param_grid_P; +--:-:-:-:1 ISETP.LT.AND P1, PT, Q, param_P, P1; +--:-:-:-:1 MOV Q, -1; +--:-:-:-:1 @P1 IADD3 Q, -q, param_Q, Q; +--:-:-:-:1 @!P1 MOV Q, q; +// c = crst / RST +// rst = crst % RST +--:-:-:-:1 XMAD.LO2C c0, crst0, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c0, c0, param_shift_RST; +--:-:-:-:1 XMAD rst0, c0, param_RST, RZ; +--:-:-:-:1 IADD rst0, -rst0, crst0; +--:-:-:-:1 XMAD.LO2C c1, crst1, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c1, c1, param_shift_RST; +--:-:-:-:1 XMAD rst1, c1, param_RST, RZ; +--:-:-:-:1 IADD rst1, -rst1, crst1; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t0, rst0, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t0, t0, param_shift_RS; +--:-:-:-:1 XMAD rs0, t0, param_RS, RZ; +--:-:-:-:1 IADD rs0, -rs0, rst0; +--:-:-:-:1 XMAD.LO2C t1, rst1, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t1, t1, param_shift_RS; +--:-:-:-:1 XMAD rs1, t1, param_RS, RZ; +--:-:-:-:1 IADD rs1, -rs1, rst1; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r0, rs0, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r0, r0, param_shift_S; +--:-:-:-:1 XMAD s0, r0, param_S, RZ; +--:-:-:-:1 IADD s0, -s0, rs0; +--:-:-:-:1 XMAD.LO2C r1, rs1, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r1, r1, param_shift_S; +--:-:-:-:1 XMAD s1, r1, param_S, RZ; +--:-:-:-:1 IADD s1, -s1, rs1; +// z = m * w - pad_d + t +// y = p * u - pad_h + r +// x = q * v - pad_w + s +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, Q, param_str_w, RZ; +--:-:-:-:1 XMAD z1, t1, param_dil_d, mt; 
+--:-:-:-:1 XMAD y1, r1, param_dil_h, pr;
+--:-:-:-:1 XMAD x1, s1, param_dil_w, qs;
+// Track 0 coordinates: same form as track 1, using t0/r0/s0 and writing z0/y0/x0.
+// (qs already holds Q*param_str_w, so the filter offset s0 is scaled by the dilation.)
+--:-:-:-:1 XMAD z0, t0, param_dil_d, mt;
+--:-:-:-:1 XMAD y0, r0, param_dil_h, pr;
+--:-:-:-:1 XMAD x0, s0, param_dil_w, qs;
+// Subtract the padding from every coordinate of both tracks.
+--:-:-:-:1 IADD z1, z1, -param_pad_d;
+--:-:-:-:1 IADD y1, y1, -param_pad_h;
+--:-:-:-:1 IADD x1, x1, -param_pad_w;
+--:-:-:-:1 IADD z0, z0, -param_pad_d;
+--:-:-:-:1 IADD y0, y0, -param_pad_h;
+--:-:-:-:1 IADD x0, x0, -param_pad_w;
+
+
+// Split blocks to fit inside of 36 registers
+
+
+// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY
+--:-:-:-:1 XMAD.LO2C ti0, c0, param_DHWN, tidY;
+--:-:-:-:1 XMAD.LO2C ti0, z0, param_HWN, ti0;
+--:-:-:-:1 XMAD.LO2C ti0, y0, param_WN, ti0;
+--:-:-:-:1 XMAD ti0, x0, param_N, ti0;
+--:-:-:-:1 XMAD.LO2C ti1, c1, param_DHWN, tidY;
+--:-:-:-:1 XMAD.LO2C ti1, z1, param_HWN, ti1;
+--:-:-:-:1 XMAD.LO2C ti1, y1, param_WN, ti1;
+--:-:-:-:1 XMAD ti1, x1, param_N, ti1;
+--:-:-:-:1 LEA track0I0.CC, ti0, param_I[0], 1;
+--:-:-:-:1 LEA.HI.X track0I1, ti0, param_I[1], RZ, 1;
+--:-:-:-:1 LEA track1I0.CC, ti1, param_I[0], 1;
+--:-:-:-:1 LEA.HI.X track1I1, ti1, param_I[1], RZ, 1;
+
+// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY
+--:-:-:-:1 XMAD.LO2C te, k, param_MPQN, tidY;
+--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te;
+--:-:-:-:1 XMAD.LO2C te, p, param_QN, te;
+--:-:-:-:1 XMAD te, Q, param_N, te;
+--:-:-:-:1 LEA trackE0.CC, te, param_E[0], 1;
+--:-:-:-:0 LEA.HI.X trackE1, te, param_E[1], RZ, 1;
+
+// Bounds check x,y,z,c for each I track.
+// If out of bounds, this will set the track address to -1 +--:-:-:-:1 ISET.GE.AND cC0, c0, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd0, z0, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD0, z0, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh0, y0, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH0, y0, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw0, x0, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW0, x0, param_W, PT; +--:-:-:-:1 LOP.OR track0I0, track0I0, cC0; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe; + +--:-:-:-:1 ISET.GE.AND cC1, c1, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd1, z1, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD1, z1, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh1, y1, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH1, y1, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw1, x1, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW1, x1, param_W, PT; +--:-:-:-:1 LOP.OR track1I0, track1I0, cC1; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:1 ISETP.NE.AND P2, PT, track0I0, -1, PT; +--:-:-:-:0 ISETP.NE.AND P3, PT, track1I0, -1, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, k, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:-:1 IADD loopN, loopN, param_N; + + +--:-:-:Y:5 @P0 BRA.U FIRST_LOAD; + +INIT_LOOP: + +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>]; +--:-:1:-:2 LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + +NEXT_16N: + +[+ + + our $convert; + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD loopN, loopN, -16;\n", + + # p0 = (N & 16) == 0 + # p1 = N >= 32 && p0 + j0c14 => "--:-:-:-:1 LOP.AND.NZ P0, RZ, loopN, 16;\n", + j0c28 => 
"--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 32, P0;\n", + + j0c34 => "--:-:-:-:1 \@P0 $convert storeX7, load0I7.H1;\n", + j0c38 => "--:-:-:-:1 \@P0 $convert storeX6, load0I7.H0;\n", + j0c42 => "--:-:-:-:1 \@P0 $convert storeX5, load0I6.H1;\n", + j0c46 => "--:-:-:-:1 \@P0 $convert storeX4, load0I6.H0;\n", + j0c50 => "--:-:-:-:1 \@P0 $convert storeX3, load0I5.H1;\n", + j0c54 => "--:-:-:-:1 \@P0 $convert storeX2, load0I5.H0;\n", + j0c58 => "--:-:-:-:1 \@P0 $convert storeX1, load0I4.H1;\n", + j0c62 => "--:-:-:-:1 \@P0 $convert storeX0, load0I4.H0;\n", + + j1c34 => "02:-:-:-:1 \@!P0 $convert storeX7, load0I3.H1;\n", + j1c38 => "--:-:-:-:1 \@!P0 $convert storeX6, load0I3.H0;\n", + j1c42 => "--:-:-:-:1 \@!P0 $convert storeX5, load0I2.H1;\n", + j1c46 => "--:-:5:-:1 \@!P0 $convert storeX4, load0I2.H0;\n", + j1c50 => "--:-:-:-:1 \@!P0 $convert storeX3, load0I1.H1;\n", + j1c54 => "--:-:-:-:1 \@!P0 $convert storeX2, load0I1.H0;\n", + j1c58 => "--:-:-:-:1 \@!P0 $convert storeX1, load0I0.H1;\n", + j1c62 => "--:-:2:-:1 \@!P0 $convert storeX0, load0I0.H0;\n", + + j2c8 => "10:-:-:-:1 STS [writeIs + 4x<7*128 + 0>], storeX7;\n", + j2c10 => "--:-:-:-:1 STS [writeIs + 4x<6*128 + 0>], storeX6;\n", + j2c12 => "--:-:-:-:1 STS [writeIs + 4x<5*128 + 0>], storeX5;\n", + j2c14 => "--:-:-:-:1 STS [writeIs + 4x<4*128 + 0>], storeX4;\n", + j2c16 => "02:-:-:-:1 STS [writeIs + 4x<3*128 + 0>], storeX3;\n", + j2c18 => "--:-:-:-:1 STS [writeIs + 4x<2*128 + 0>], storeX2;\n", + j2c20 => "--:-:-:-:1 STS [writeIs + 4x<1*128 + 0>], storeX1;\n", + j2c22 => "--:2:-:-:1 STS [writeIs + 4x<0*128 + 0>], storeX0;\n", + + j2c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, track0I0, -1, P1;\n", + j2c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n", + + j3c8 => "02:-:-:-:1 \@P2 LDG.E.CI.128 load0I0, [track0I + 2x< 0>];\n", + j3c10 => "--:5:2:-:1 \@P2 LDG.E.CI.128 load0I4, [track0I + 2x<16>];\n", + + j4c8 => "--:-:-:-:1 \@P3 LDS.U.128 load0I0, [addr_zero];\n", + j5c8 => "--:-:-:-:1 \@P3 LDS.U.128 load0I4, 
[addr_zero];\n", + + j5c57 => "10:-:-:-:1 \@P2 IADD track0I0.CC, track0I0, 2x<32>;\n", + j5c63 => "--:-:-:-:1 \@P2 IADD.X track0I1, track0I1, RZ;\n", + + + j5c34 => "--:-:-:-:1 \@P0 $convert storeX7, load1I7.H1;\n", + j5c38 => "--:-:-:-:1 \@P0 $convert storeX6, load1I7.H0;\n", + j5c42 => "--:-:-:-:1 \@P0 $convert storeX5, load1I6.H1;\n", + j5c46 => "--:-:-:-:1 \@P0 $convert storeX4, load1I6.H0;\n", + j5c50 => "--:-:-:-:1 \@P0 $convert storeX3, load1I5.H1;\n", + j5c54 => "--:-:-:-:1 \@P0 $convert storeX2, load1I5.H0;\n", + j5c58 => "--:-:-:-:1 \@P0 $convert storeX1, load1I4.H1;\n", + j5c62 => "--:-:-:-:1 \@P0 $convert storeX0, load1I4.H0;\n", + + j6c34 => "04:-:-:-:1 \@!P0 $convert storeX7, load1I3.H1;\n", + j6c38 => "--:-:-:-:1 \@!P0 $convert storeX6, load1I3.H0;\n", + j6c42 => "--:-:-:-:1 \@!P0 $convert storeX5, load1I2.H1;\n", + j6c46 => "--:-:5:-:1 \@!P0 $convert storeX4, load1I2.H0;\n", + j6c50 => "--:-:-:-:1 \@!P0 $convert storeX3, load1I1.H1;\n", + j6c54 => "--:-:-:-:1 \@!P0 $convert storeX2, load1I1.H0;\n", + j6c58 => "--:-:-:-:1 \@!P0 $convert storeX1, load1I0.H1;\n", + j6c62 => "--:-:3:-:1 \@!P0 $convert storeX0, load1I0.H0;\n", + + j7c8 => "10:-:-:-:1 STS [writeIs + 4x<7*128 + 64>], storeX7;\n", + j7c10 => "--:-:-:-:1 STS [writeIs + 4x<6*128 + 64>], storeX6;\n", + j7c12 => "--:-:-:-:1 STS [writeIs + 4x<5*128 + 64>], storeX5;\n", + j7c14 => "--:-:-:-:1 STS [writeIs + 4x<4*128 + 64>], storeX4;\n", + j7c16 => "04:-:-:-:1 STS [writeIs + 4x<3*128 + 64>], storeX3;\n", + j7c18 => "--:-:-:-:1 STS [writeIs + 4x<2*128 + 64>], storeX2;\n", + j7c20 => "--:-:-:-:1 STS [writeIs + 4x<1*128 + 64>], storeX1;\n", + j7c22 => "--:3:-:-:1 STS [writeIs + 4x<0*128 + 64>], storeX0;\n", + + j7c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, track1I0, -1, P1;\n", + j7c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n", + + j8c8 => "04:-:-:-:1 \@P2 LDG.E.CI.128 load1I0, [track1I + 2x< 0>];\n", + j8c10 => "--:5:3:-:1 \@P2 LDG.E.CI.128 load1I4, [track1I + 2x<16>];\n", + + j9c8 => 
"--:-:-:-:1 \@P3 LDS.U.128 load1I0, [addr_zero];\n", + j10c8 => "--:-:-:-:1 \@P3 LDS.U.128 load1I4, [addr_zero];\n", + + j10c57 => "10:-:-:-:1 \@P2 IADD track1I0.CC, track1I0, 2x<32>;\n", + j10c63 => "--:-:-:-:1 \@P2 IADD.X track1I1, track1I1, RZ;\n", + + + j10c34 => "--:-:-:-:1 \@P0 $convert storeX7, loadE7.H1;\n", + j10c38 => "--:-:-:-:1 \@P0 $convert storeX6, loadE7.H0;\n", + j10c42 => "--:-:-:-:1 \@P0 $convert storeX5, loadE6.H1;\n", + j10c46 => "--:-:-:-:1 \@P0 $convert storeX4, loadE6.H0;\n", + j10c50 => "--:-:-:-:1 \@P0 $convert storeX3, loadE5.H1;\n", + j10c54 => "--:-:-:-:1 \@P0 $convert storeX2, loadE5.H0;\n", + j10c58 => "--:-:-:-:1 \@P0 $convert storeX1, loadE4.H1;\n", + j10c62 => "--:-:-:-:1 \@P0 $convert storeX0, loadE4.H0;\n", + + j11c34 => "08:-:-:-:1 \@!P0 $convert storeX7, loadE3.H1;\n", + j11c38 => "--:-:-:-:1 \@!P0 $convert storeX6, loadE3.H0;\n", + j11c42 => "--:-:-:-:1 \@!P0 $convert storeX5, loadE2.H1;\n", + j11c46 => "--:-:5:-:1 \@!P0 $convert storeX4, loadE2.H0;\n", + j11c50 => "--:-:-:-:1 \@!P0 $convert storeX3, loadE1.H1;\n", + j11c54 => "--:-:-:-:1 \@!P0 $convert storeX2, loadE1.H0;\n", + j11c58 => "--:-:-:-:1 \@!P0 $convert storeX1, loadE0.H1;\n", + j11c62 => "--:-:4:-:1 \@!P0 $convert storeX0, loadE0.H0;\n", + + j12c8 => "10:-:-:-:1 STS [writeEs + 4x<7*64>], storeX7;\n", + j12c10 => "--:-:-:-:1 STS [writeEs + 4x<6*64>], storeX6;\n", + j12c12 => "--:-:-:-:1 STS [writeEs + 4x<5*64>], storeX5;\n", + j12c14 => "--:-:-:-:1 STS [writeEs + 4x<4*64>], storeX4;\n", + j12c16 => "08:-:-:-:1 STS [writeEs + 4x<3*64>], storeX3;\n", + j12c18 => "--:-:-:-:1 STS [writeEs + 4x<2*64>], storeX2;\n", + j12c20 => "--:-:-:-:1 STS [writeEs + 4x<1*64>], storeX1;\n", + j12c22 => "--:4:-:-:1 STS [writeEs + 4x<0*64>], storeX0;\n", + + j12c24 => "--:-:-:-:1 ISETP.LT.AND P2, PT, k, param_K, P1;\n", + + j13c8 => "08:-:-:-:1 \@P2 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];\n", + j13c10 => "--:5:4:-:1 \@P2 LDG.E.CI.128 loadE4, [trackE + 2x<16>];\n", + + j15c57 => 
"10:-:-:-:1 \@P2 IADD trackE0.CC, trackE0, 2x<32>;\n", + j15c62 => "--:-:-:-:1 \@P2 IADD.X trackE1, trackE1, RZ;\n", + + # p0 = N >= 16 and not (N == 32 and (p or q)) + j14c8 => "--:-:-:-:1 ISETP.EQ.AND P0, PT, loopN, 32, PT;\n", + j14c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 16, PT;\n", + j14c22 => "--:-:-:-:1 PSETP.OR.AND P0, PT, P5, P6, P0;\n", + j14c35 => "--:-:-:-:1 PSETP.AND.AND P0, PT, !P0, P1, PT;\n", + + j14c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "20:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j15c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_16N;\n" . + "--:-:-:-:0 \@P5 IADD q, q, param_grid_Q;\n" . + "01:-:-:Y:5 \@P5 BRA.U NEXT_PQ;\n" . + "--:-:-:-:1 \@P6 MOV q, qq;\n" . + "--:-:-:-:0 \@P6 IADD p, p, param_grid_P;\n" . + "--:-:-:Y:5 \@P6 BRA.U NEXT_PQ;\n" . + "--:-:-:Y:5 BRA.U FINISH;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? '@P0' : ' '; + my $shift = $rsOffset < 8 ? 0 : 1; + my $barrier = $j == 14 ? 
'6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + ++] + +FIRST_LOAD: + +--:-:-:-:8 PSETP.AND.AND P0, PT, PT, PT, !PT; + +--:-:-:-:1 @P2 LDG.E.CI.128 load0I0, [track0I + 2x< 0>]; +--:-:1:-:1 @P2 LDG.E.CI.128 load0I4, [track0I + 2x<16>]; +--:-:-:-:1 @!P2 LDS.U.128 load0I0, [addr_zero]; +--:-:4:-:1 @!P2 LDS.U.128 load0I4, [addr_zero]; + +// p1 = N == 32 and (p or q) +--:-:-:-:0 ISETP.EQ.AND P1, PT, loopN, 32, PT; + +--:-:-:-:1 @P3 LDG.E.CI.128 load1I0, [track1I + 2x< 0>]; +--:-:2:-:1 @P3 LDG.E.CI.128 load1I4, [track1I + 2x<16>]; +--:-:-:-:1 @!P3 LDS.U.128 load1I0, [addr_zero]; +--:-:5:-:1 @!P3 LDS.U.128 load1I4, [addr_zero]; + +--:-:-:-:1 @P4 LDG.E.CI.128 loadE0, [trackE + 2x< 0>]; +--:-:3:-:1 @P4 LDG.E.CI.128 loadE4, [trackE + 2x<16>]; +--:-:-:-:1 @!P4 LDS.U.128 loadE0, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 loadE4, [addr_zero]; + + +09:-:-:-:1 [+ convert() +] storeX7, load0I3.H1; +--:-:-:-:1 [+ convert() +] storeX6, load0I3.H0; +--:-:-:-:1 [+ convert() +] storeX5, load0I2.H1; +--:-:1:-:1 [+ convert() +] storeX4, 
load0I2.H0; +--:-:-:-:1 [+ convert() +] storeX3, load0I1.H1; +--:-:-:-:1 [+ convert() +] storeX2, load0I1.H0; +--:-:-:-:1 [+ convert() +] storeX1, load0I0.H1; +--:-:4:-:1 [+ convert() +] storeX0, load0I0.H0; + +--:-:-:-:1 PSETP.OR.AND P1, PT, P5, P6, P1; +--:-:-:-:0 @P2 IADD track0I0.CC, track0I0, 2x<32>; + +01:-:-:-:1 STS [writeIs + 4x<7*128 + 0>], storeX7; +--:-:-:-:1 STS [writeIs + 4x<6*128 + 0>], storeX6; +--:-:-:-:1 STS [writeIs + 4x<5*128 + 0>], storeX5; +--:-:-:-:1 STS [writeIs + 4x<4*128 + 0>], storeX4; +08:-:-:-:1 STS [writeIs + 4x<3*128 + 0>], storeX3; +--:-:-:-:1 STS [writeIs + 4x<2*128 + 0>], storeX2; +--:-:-:-:1 STS [writeIs + 4x<1*128 + 0>], storeX1; +--:1:-:-:2 STS [writeIs + 4x<0*128 + 0>], storeX0; + +--:-:-:-:0 @P2 IADD.X track0I1, track0I1, RZ; + +13:-:-:-:1 [+ convert() +] storeX7, load1I3.H1; +--:-:-:-:1 [+ convert() +] storeX6, load1I3.H0; +--:-:-:-:1 [+ convert() +] storeX5, load1I2.H1; +--:-:2:-:1 [+ convert() +] storeX4, load1I2.H0; +--:-:-:-:1 [+ convert() +] storeX3, load1I1.H1; +--:-:-:-:1 [+ convert() +] storeX2, load1I1.H0; +--:-:-:-:1 [+ convert() +] storeX1, load1I0.H1; +--:-:5:-:1 [+ convert() +] storeX0, load1I0.H0; + +--:-:-:-:1 PSETP.AND.AND P5, PT, P5, P1, PT; +--:-:-:-:0 @P3 IADD track1I0.CC, track1I0, 2x<32>; + +02:-:-:-:1 STS [writeIs + 4x<7*128 + 64>], storeX7; +--:-:-:-:1 STS [writeIs + 4x<6*128 + 64>], storeX6; +--:-:-:-:1 STS [writeIs + 4x<5*128 + 64>], storeX5; +--:-:-:-:1 STS [writeIs + 4x<4*128 + 64>], storeX4; +10:-:-:-:1 STS [writeIs + 4x<3*128 + 64>], storeX3; +--:-:-:-:1 STS [writeIs + 4x<2*128 + 64>], storeX2; +--:-:-:-:1 STS [writeIs + 4x<1*128 + 64>], storeX1; +--:1:-:-:1 STS [writeIs + 4x<0*128 + 64>], storeX0; + +--:-:-:-:1 PSETP.AND.AND P6, PT, P6, P1, PT; +--:-:-:-:0 @P3 IADD.X track1I1, track1I1, RZ; + +25:-:-:-:1 [+ convert() +] storeX7, loadE3.H1; +--:-:-:-:1 [+ convert() +] storeX6, loadE3.H0; +--:-:-:-:1 [+ convert() +] storeX5, loadE2.H1; +--:-:3:-:1 [+ convert() +] storeX4, loadE2.H0; +--:-:-:-:1 [+ 
convert() +] storeX3, loadE1.H1; +--:-:-:-:1 [+ convert() +] storeX2, loadE1.H0; +--:-:-:-:1 [+ convert() +] storeX1, loadE0.H1; +--:-:6:-:1 [+ convert() +] storeX0, loadE0.H0; + +--:-:-:-:0 @P4 IADD trackE0.CC, trackE0, 2x<32>; + +04:-:-:-:1 STS [writeEs + 4x<7*64>], storeX7; +--:-:-:-:1 STS [writeEs + 4x<6*64>], storeX6; +--:-:-:-:1 STS [writeEs + 4x<5*64>], storeX5; +--:-:-:-:1 STS [writeEs + 4x<4*64>], storeX4; +20:-:-:-:1 STS [writeEs + 4x<3*64>], storeX3; +--:-:-:-:1 STS [writeEs + 4x<2*64>], storeX2; +--:-:-:-:1 STS [writeEs + 4x<1*64>], storeX1; +--:1:-:-:1 STS [writeEs + 4x<0*64>], storeX0; + +--:-:-:-:1 @P4 IADD.X trackE1, trackE1, RZ; + +--:-:-:-:1 IADD readEs, readEs, -swapBuf; +--:-:-:-:0 IADD readIs, readIs, -swapBuf; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:0 @P5 IADD q, q, param_grid_Q; +--:-:-:Y:5 @P5 BRA.U NEXT_PQ; +--:-:-:-:0 @P6 IADD p, p, param_grid_P; +--:-:-:Y:5 @P6 BRA.U NEXT_PQ; + +--:-:-:-:2 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:Y:5 BRA.U INIT_LOOP; + + +FINISH: + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID2; +--:-:2:-:1 S2R blkI, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.X; +--:-:-:-:5 BRA.U END_CTAID2; +CTAID2: +--:-:2:-:1 S2R blkI, SR_CTAID.X; +--:-:3:-:1 S2R blkE, SR_CTAID.Y; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.Z; +END_CTAID2: + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readEs, readEs, -4x; +--:-:-:-:1 @P0 IADD readIs, readIs, -swapBuf; +--:-:-:-:1 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 64 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 4; + + +// readCs = ((tid & 96) << 3) | (tid 
& 31) +01:-:-:-:1 LOP.AND tid31, tid, 31; +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + + +// kk = blkE*64 + tid31; +04:-:-:-:1 ISCADD kk, blkE, tid31, 6; +--:-:-:-:1 IADD kk, kk, param_offset_K; + +// crst = blkI*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 crst00, tid96, 1; +02:-:-:-:1 ISCADD crst00, blkI, crst00, 7; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; +--:-:-:-:1 SHL K4, K, 4; +--:-:-:-:1 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +--:-:-:-:1 VMAD.U16.U16 tf, crst00, K, kk; +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSTK, param_CRSTK; +08:-:-:-:1 XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ; + }; + } + return ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 2; +--:-:-:-:1 LEA.HI.X track00F1, tf, param_F[1], RZ, 2; + +--:-:-:-:1 MOV alpha, param_alpha; + +// kk < K +--:-:-:-:1 ISETP.LT.AND P5, PT, kk, param_K, PT; +--:-:-:-:1 IADD kk, kk, 32; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, PT; + + + +--:-:-:-:6 IADD track04F0.CC, track00F0, K4; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:6 IADD track08F0.CC, track04F0, K4; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:6 IADD track12F0.CC, track08F0, K4; +--:-:-:-:1 IADD.X track12F1, track08F1, RZ; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD track00F0.CC, track00F0, K60;\n" . + "--:-:-:-:1 IADD crst00, crst00, 60;\n" . + "--:-:-:-:1 IADD.X track00F1, track00F1, RZ;\n" . + "--:-:-:-:5 IADD track04F0.CC, track04F0, K60;\n" . + "--:-:-:-:1 IADD crst04, crst04, 60;\n" . + "--:-:-:-:1 IADD.X track04F1, track04F1, RZ;\n" . + "--:-:-:-:5 IADD track08F0.CC, track08F0, K60;\n" . + "--:-:-:-:1 IADD crst08, crst08, 60;\n" . + "--:-:-:-:1 IADD.X track08F1, track08F1, RZ;\n" . + "--:-:-:-:5 IADD track12F0.CC, track12F0, K60;\n" . 
+ "--:-:-:-:1 IADD crst12, crst12, 60;\n" . + "--:-:-:-:1 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL f0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL f1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL f2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL f3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL f4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL f5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL f6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL f7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +--:-:-:-:0 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], f0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], f4; + +--:-:1:-:1 LDS f0, [readCs + 4x<0*64 + 00>]; +--:-:2:-:1 LDS f2, [readCs + 4x<1*64 + 00>]; +--:-:3:-:1 LDS f4, [readCs + 4x<2*64 + 00>]; +--:-:4:-:1 LDS f6, [readCs + 4x<3*64 + 00>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:-:-:-:1 @P0 STG.E.CG [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 STG.E.CG [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 STG.E.CG [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 STG.E.CG [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } + else + { + return q{ +01:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2; +--:-:-:-:1 
PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } ++] + +--:-:1:-:1 LDS f1, [readCs + 4x<0*64 + 32>]; +--:-:2:-:1 LDS f3, [readCs + 4x<1*64 + 32>]; +--:-:3:-:1 LDS f5, [readCs + 4x<2*64 + 32>]; +--:-:4:-:1 LDS f7, [readCs + 4x<3*64 + 32>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:1:-:-:1 @P0 STG.E.CG [track00F + 4x<32>], f1; +02:2:-:-:1 @P1 STG.E.CG [track04F + 4x<32>], f3; +04:3:-:-:1 @P2 STG.E.CG [track08F + 4x<32>], f5; +08:4:-:-:1 @P3 STG.E.CG [track12F + 4x<32>], f7; + }; + } + else + { + return q{ +01:1:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1; +02:2:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3; +04:3:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5; +08:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7; + }; + } ++] + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X128_N128.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X128_N128.sass new file mode 100644 index 0000000..71bae4b --- /dev/null +++ b/Kernel/Convolution/Maxwell/hconv_xprop_X128_N128.sass @@ -0,0 +1,261 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 128; + our $shareF = 128; + our $stepI = 32; + our $stepF = 64; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + szShareF : (128*8) + szShareI : (128*8) + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<128*8*2 + 128*8*2 + 4> + addr_m : 4x<128*8*2 + 128*8*2 + 4> + addr_p : 4x<128*8*2 + 128*8*2 + 5> + addr_q : 4x<128*8*2 + 128*8*2 + 6> + addr_k : 4x<128*8*2 + 128*8*2 + 7> + addr_szLut : 4x<128*8*2 + 128*8*2 + 8> + addr_lut : 4x<128*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne + 72-111 ~ tid1, tid128, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + + 100-103 : loadI<0-1>, loadF<0-1> + 
104-107 : storeI<0-3> + 104-107 : storeF<0-3> + + 108-111 ~ offsetF, offsetI, offsetFc, offsetIc + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 114-122 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset + 123-127 ~ readFs, readIs, tid, idx_N + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-122 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidX = (tid & 31) << 2 +// tidY = tid >> 5 +--:-:-:-:1 LOP.AND tidX, tid, 31; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 5; + +// trackF += blkF*128 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 7; + +// trackI += blkI*128 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 7; + +// writeS = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 SHL writeS, writeS, 2; + +// readFs = ((tid & 112) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 112; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7) +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 SHR.U32 tid128, tid128, 3; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid128; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.64 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.64 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.64 loadI, [addr_zero]; + +12:-:-:-:1 [+ convert() +] storeF3, loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; 
+--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:2:-:2 [+ convert() +] storeF0, loadF0.H0; + +02:1:-:-:2 STS.128 [writeS], storeF; + +25:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:2:-:2 [+ convert() +] storeI0, loadI0.H0; + +02:1:-:-:1 STS.128 [writeS + 4x], storeI; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.64 loadI, [trackI]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c33 => "02:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c41 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c45 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c60 => "02:2:-:-:1 \@P0 STS.128 [writeS], storeF;\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "20:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => 
"--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c40 => "02:-:2:-:1 \@P1 LDG.E.CI.64 loadF, [trackF];\n", + + + j5c45 => "04:-:-:-:1 \@P0 $convert storeI3, loadI1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert storeI2, loadI1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert storeI1, loadI0.H1;\n", + j5c57 => "--:-:3:-:1 \@P0 $convert storeI0, loadI0.H0;\n", + + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x], storeI;\n", + + j6c54 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j6c59 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j6c61 => "04:-:3:-:1 \@P1 LDG.E.64 loadI, [trackI];\n", + + j6c62 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; + + + +// tidOX = (tid & 7) << 3 + (tid & 128) >> 1 +// tidOY = (tid & 127) >> 3 +--:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 3; +--:-:-:-:1 LOP.AND tidOX2, tid, 128; +--:-:-:-:1 SHR.U32 tidOX2, tidOX2, 1; +--:-:-:-:1 LOP.OR tidOX, tidOX, tidOX2; +--:-:-:-:1 LOP.AND tidOY, tid, 127; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 LOP.AND readIs, readIs, 0x1ff; +--:-:-:-:1 LOP.AND readFs, readFs, 0x0ff; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +--:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*128 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 7; + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X128_N64.sass 
b/Kernel/Convolution/Maxwell/hconv_xprop_X128_N64.sass new file mode 100644 index 0000000..ce64717 --- /dev/null +++ b/Kernel/Convolution/Maxwell/hconv_xprop_X128_N64.sass @@ -0,0 +1,284 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 64; + our $shareF = 128; + our $stepI = 32; + our $stepF = 64; + our $remapF = 1; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + szShareF : (128*8) + szShareI : (64*8) + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 128*8*2 + 4> + addr_m : 4x<64*8*2 + 128*8*2 + 4> + addr_p : 4x<64*8*2 + 128*8*2 + 5> + addr_q : 4x<64*8*2 + 128*8*2 + 6> + addr_k : 4x<64*8*2 + 128*8*2 + 7> + addr_szLut : 4x<64*8*2 + 128*8*2 + 8> + addr_lut : 4x<64*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-109 ~ tid1, tid15, tidFX, tidIX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, 
neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-109 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + + 100-103 : loadF<0-3> + 100-103 : storeF<0-3> + 104-107 : storeF<4-7> + + 108-109 : loadI<0-1> + 104-107 : storeI<0-3> + + 104-107 ~ offsetF + + 110-111 : sliceI, sliceF + 110-111 : sliceIF<0-1> + + 112-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc + 125-127 ~ readFs, readIs, swapBuf + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-124 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidFX = (tid & 15) << 3 +// tidIX = (tid & 15) << 2 +// tidY = tid >> 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidFX, tid15, 3; +--:-:-:-:1 SHL tidIX, tid15, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 4; + +// trackF += blkF*128 + tidFX + offset_K +--:-:-:-:1 ISCADD offsetFk, idx_K, tidFX, 7; + +// trackI += blkI*64 + tidIX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidIX, 6; + +// Remap the FX dim to avoid bank conflicts when storing to shared + +// writeFs = (128*tidY + tidIX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidIX, 7; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (64*tidY + tidIX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidIX, 6; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = ((tid & -16) >> 3) | (tid & 1) 
+--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = (tid >> 1) & 7 +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.128 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.64 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.64 loadI, [addr_zero]; + +12:-:-:-:1 [+ convert() +] storeF7, loadF3.H1; +--:-:-:-:1 [+ convert() +] storeF6, loadF3.H0; +--:-:-:-:1 [+ convert() +] storeF5, loadF2.H1; +--:-:1:-:1 [+ convert() +] storeF4, loadF2.H0; +--:-:-:-:1 [+ convert() +] storeF3, loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; +--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:2:-:1 [+ convert() +] storeF0, loadF0.H0; + +01:-:-:-:1 STS.128 [writeFs + 4x<64>], storeF4; +02:1:-:-:2 STS.128 [writeFs + 4x<00>], storeF0; + +25:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:3:-:2 [+ convert() +] storeI0, loadI0.H0; + +04:1:-:-:1 STS.128 [writeIs], storeI0; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.64 loadI, [trackI]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 
\@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c29 => "02:-:-:-:1 \@P0 $convert storeF7, loadF3.H1;\n", + j1c33 => "--:-:-:-:1 \@P0 $convert storeF6, loadF3.H0;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF5, loadF2.H1;\n", + j1c41 => "--:-:5:-:1 \@P0 $convert storeF4, loadF2.H0;\n", + j1c45 => "--:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c49 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c53 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c57 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c59 => "10:5:-:-:1 \@P0 STS.128 [writeFs + 4x<64>], storeF4;\n", + j2c8 => "02:2:-:-:1 \@P0 STS.128 [writeFs + 4x<00>], storeF0;\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "30:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c40 => "02:-:2:-:1 \@P1 LDG.E.CI.128 loadF, [trackF];\n", + + j5c45 => "04:-:-:-:1 \@P0 $convert storeI3, loadI1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert storeI2, loadI1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert storeI1, loadI0.H1;\n", + j5c57 => "--:-:3:-:1 \@P0 $convert storeI0, loadI0.H0;\n", + + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeIs], storeI0;\n", + + j6c55 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j6c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j6c62 => "04:-:3:-:1 \@P1 LDG.E.64 loadI, [trackI];\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 3 +// tidOY = tid >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 3; +--:-:-:-:1 SHR.U32 tidOY, tid, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 64 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 6; + +// Mul by 4 here expands k stride back out +// Mul by 2 again to undo the bank conflict avoiding stride +// k = blkF*128 + tidOY * 8 +--:-:-:-:1 SHL tidOY, tidOY, 3; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 7; + +[+ output_setup(63, 0, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X32_N128.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X32_N128.sass new file mode 100644 index 0000000..e85f7d4 --- /dev/null +++ b/Kernel/Convolution/Maxwell/hconv_xprop_X32_N128.sass @@ -0,0 +1,323 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 128; + our $shareF = 32; + our $stepI = 32; + our $stepF = 16; + our $remapI = 1; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + + addr_zero : 4x<32*8*2 + 128*8*2 + 0> + szShareF : (32*8) + szShareI : (128*8) + + addr_zero : 4x<32*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<32*8*2 + 128*8*2 + 4> + addr_m : 4x<32*8*2 + 128*8*2 + 4> + addr_p : 4x<32*8*2 + 128*8*2 + 5> + addr_q : 4x<32*8*2 + 128*8*2 + 6> + addr_k : 4x<32*8*2 + 128*8*2 + 7> + addr_szLut : 4x<32*8*2 + 128*8*2 + 8> + addr_lut : 4x<32*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-69 : m, p, q + 64-69 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne + 70-113 ~ tid1, tidIX, tidFX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 70-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + + 100-103 : load0I<0-3> + 
100-103 : store0I<0-3> + 104-107 : store0I<4-7> + + 108-111 : load1I<0-3> + 108-111 : store1I<0-3> + 104-107 : store1I<4-7> + + 112-113 : loadF<0-1> + 104-107 : storeF<0-3> + + 114-115 : sliceI, sliceF + 114-115 : sliceIF<0-1> + + 116-140 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetF, offsetIc, offsetFc + 141-155 ~ readFs, readIs, swapBuf, tid, idx_N, tid7, tid1_7, tid32, tid32_1 + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-140 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] + +[+ get_mpqk() +] + +// tidIX = (tid & 7) << 3 +// tidFX = (tid & 7) << 2 + +// tidY = tid >> 3 +--:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHL tidIX, tid7, 3; +--:-:-:-:1 SHL tidFX, tid7, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 3; + +// trackF += blkF*32 + tidFX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidFX, 5; + +// trackI += blkI*128 + tidIX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidIX, 7; + +// writeFs = (32*tidY + tidFX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidFX, 5; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// Remap the IX dim to avoid bank conflicts when storing to shared + +// writeIs = (128*tidY + tidFX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidFX, 7; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = (((tid & 16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 32) >> 1) | ((tid >> 1) & 7) << 4 +--:-:-:-:1 LOP.AND tid32, tid, 32; +--:-:-:-:1 SHR.U32 
tid32_1, tid32, 1; +--:-:-:-:1 BFE.U32 tid1_7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, tid1_7, tid32_1; +--:-:-:-:1 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.64 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 load0I, [trackI + 2x<00>]; +--:-:4:-:1 @P1 LDG.E.128 load1I, [trackI + 2x<64>]; +--:-:-:-:1 @!P1 LDS.U.128 load0I, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 load1I, [addr_zero]; + +12:-:-:-:1 [+ convert() +] storeF3, loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; +--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:2:-:2 [+ convert() +] storeF0, loadF0.H0; + +02:1:-:-:2 STS.128 [writeFs], storeF0; + +25:-:-:-:1 [+ convert() +] store0I7, load0I3.H1; +--:-:-:-:1 [+ convert() +] store0I6, load0I3.H0; +--:-:-:-:1 [+ convert() +] store0I5, load0I2.H1; +--:-:2:-:1 [+ convert() +] store0I4, load0I2.H0; +--:-:-:-:1 [+ convert() +] store0I3, load0I1.H1; +--:-:-:-:1 [+ convert() +] store0I2, load0I1.H0; +--:-:-:-:1 [+ convert() +] store0I1, load0I0.H1; +--:-:3:-:1 [+ convert() +] store0I0, load0I0.H0; + +02:-:-:-:1 STS.128 [writeIs + 4x<32>], store0I4; +04:1:-:-:2 STS.128 [writeIs + 4x<00>], store0I0; + +09:-:-:-:1 [+ convert() +] store1I7, load1I3.H1; +--:-:-:-:1 [+ convert() +] store1I6, load1I3.H0; +--:-:-:-:1 [+ convert() +] store1I5, load1I2.H1; +--:-:2:-:1 [+ convert() +] store1I4, load1I2.H0; +--:-:-:-:1 [+ convert() +] store1I3, load1I1.H1; +--:-:-:-:1 [+ convert() +] store1I2, load1I1.H0; +--:-:-:-:1 [+ convert() +] store1I1, load1I0.H1; +--:-:3:-:1 [+ convert() +] store1I0, load1I0.H0; + +02:-:-:-:1 STS.128 [writeIs + 4x<96>], store1I4; +04:1:-:-:1 STS.128 [writeIs + 4x<64>], store1I0; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.128 load0I, [trackI + 2x<00>]; +--:5:4:-:1 @P1 LDG.E.128 load1I, [trackI + 2x<64>]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, 
posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c33 => "02:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c41 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c45 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c60 => "02:-:-:-:1 \@P0 STS.128 [writeFs], storeF0;\n", + + j1c62 => "--:-:2:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "02:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.64 loadF0, [trackF];\n", + + + j3c29 => "04:-:-:-:1 \@P0 $convert store0I7, load0I3.H1;\n", + j3c33 => "--:-:-:-:1 \@P0 $convert store0I6, load0I3.H0;\n", + j3c37 => "--:-:-:-:1 \@P0 $convert store0I5, load0I2.H1;\n", + j3c41 => "--:-:6:-:1 \@P0 $convert store0I4, load0I2.H0;\n", + j3c45 => "--:-:-:-:1 \@P0 $convert store0I3, load0I1.H1;\n", + j3c49 => "--:-:-:-:1 \@P0 $convert store0I2, load0I1.H0;\n", + j3c53 => "--:-:-:-:1 \@P0 $convert store0I1, 
load0I0.H1;\n", + j3c57 => "--:-:3:-:1 \@P0 $convert store0I0, load0I0.H0;\n", + + j3c59 => "20:-:-:-:1 \@P0 STS.128 [writeIs + 4x<32>], store0I4;\n", + j4c8 => "04:3:-:-:1 \@P0 STS.128 [writeIs + 4x<00>], store0I0;\n", + + j4c50 => "10:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j4c55 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j4c61 => "04:-:3:-:1 \@P1 LDG.E.128 load0I0, [trackI + 2x<00>];\n", + + + j5c29 => "08:-:-:-:1 \@P0 $convert store1I7, load1I3.H1;\n", + j5c33 => "--:-:-:-:1 \@P0 $convert store1I6, load1I3.H0;\n", + j5c37 => "--:-:-:-:1 \@P0 $convert store1I5, load1I2.H1;\n", + j5c41 => "--:-:6:-:1 \@P0 $convert store1I4, load1I2.H0;\n", + j5c45 => "--:-:-:-:1 \@P0 $convert store1I3, load1I1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert store1I2, load1I1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert store1I1, load1I0.H1;\n", + j5c57 => "--:-:4:-:1 \@P0 $convert store1I0, load1I0.H0;\n", + + j5c59 => "20:-:-:-:1 \@P0 STS.128 [writeIs + 4x<96>], store1I4;\n", + j6c8 => "08:4:-:-:1 \@P0 STS.128 [writeIs + 4x<64>], store1I0;\n", + + j6c61 => "08:5:4:-:1 \@P1 LDG.E.128 load1I0, [trackI + 2x<64>];\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; + + + +// tidOX = (tid & 7) << 3 + (tid & 32) << 1 +// tidOY = (tid & 31) >> 3 +--:-:-:-:1 SHL tid32, tid32, 1; +--:-:-:-:1 ISCADD tidOX, tid7, tid32, 3; +--:-:-:-:1 LOP.AND tidOY, tid, 31; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; + +// readIs = ((tid & 32) >> 1) | (((tid >> 1) & 7) << 1) << 4 +--:-:-:-:1 ISCADD readIs, tid1_7, tid32_1, 1; +--:-:-:-:1 SHL readIs, readIs, 4; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +--:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*32 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +--:-:-:-:1 ISCADD k, idx_K, tidOY, 5; + + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X64_N128.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X64_N128.sass new file mode 100644 index 0000000..38f8183 --- /dev/null +++ b/Kernel/Convolution/Maxwell/hconv_xprop_X64_N128.sass @@ -0,0 +1,293 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 128; + our $shareF = 64; + our $stepI = 64; + our $stepF = 32; + our $remapI = 1; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + szShareF : (64*8) + szShareI : (128*8) + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 128*8*2 + 4> + addr_m : 4x<64*8*2 + 128*8*2 + 4> + addr_p : 4x<64*8*2 + 128*8*2 + 5> + addr_q : 4x<64*8*2 + 128*8*2 + 6> + addr_k : 4x<64*8*2 + 128*8*2 + 7> + addr_szLut : 4x<64*8*2 + 128*8*2 + 8> + addr_lut : 4x<64*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-111 ~ tid1, tid15, tid64, tidIX, tidFX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + + 100-103 : loadI<0-3> + 100-103 : storeI<0-3> + 104-107 : storeI<4-7> + + 108-109 : loadF<0-1> + 104-107 : storeF<0-3> + + 110-111 : sliceI, sliceF + 110-111 : sliceIF<0-1> + + 108-109 ~ offsetF + + 112-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, 
offsetIc, offsetFc + 125-127 ~ readFs, readIs, swapBuf + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-124 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidIX = (tid & 15) << 3 +// tidFX = (tid & 15) << 2 +// tidY = tid >> 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidIX, tid15, 3; +--:-:-:-:1 SHL tidFX, tid15, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 4; + +// trackF += blkF*64 + tidFX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidFX, 6; + +// trackI += blkI*128 + tidIX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidIX, 7; + +// writeFs = (64*tidY + tidFX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidFX, 6; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// Remap the IX dim to avoid bank conflicts when storing to shared + +// writeIs = (128*tidY + tidFX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidFX, 7; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = ((tid & 48) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 48; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 64) >> 3) | ((tid >> 1) & 7) +--:-:-:-:1 LOP.AND tid64, tid, 64; +--:-:-:-:1 SHR.U32 tid64, tid64, 3; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid64; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.64 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero]; + +12:-:-:-:1 [+ convert() +] storeF3, 
loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; +--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:2:-:2 [+ convert() +] storeF0, loadF0.H0; + +02:1:-:-:2 STS.128 [writeFs], storeF0; + +25:-:-:-:1 [+ convert() +] storeI7, loadI3.H1; +--:-:-:-:1 [+ convert() +] storeI6, loadI3.H0; +--:-:-:-:1 [+ convert() +] storeI5, loadI2.H1; +--:-:2:-:1 [+ convert() +] storeI4, loadI2.H0; +--:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:3:-:1 [+ convert() +] storeI0, loadI0.H0; + +02:-:-:-:1 STS.128 [writeIs + 4x<64>], storeI4; +04:1:-:-:1 STS.128 [writeIs + 4x<00>], storeI0; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.128 loadI, [trackI]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c33 => "02:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c41 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c45 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c60 => "02:2:-:-:1 \@P0 STS.128 [writeFs], storeF0;\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, 
param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "22:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.64 loadF, [trackF];\n", + + + j5c29 => "04:-:-:-:1 \@P0 $convert storeI7, loadI3.H1;\n", + j5c33 => "--:-:-:-:1 \@P0 $convert storeI6, loadI3.H0;\n", + j5c37 => "--:-:-:-:1 \@P0 $convert storeI5, loadI2.H1;\n", + j5c41 => "--:-:6:-:1 \@P0 $convert storeI4, loadI2.H0;\n", + j5c45 => "--:-:-:-:1 \@P0 $convert storeI3, loadI1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert storeI2, loadI1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert storeI1, loadI0.H1;\n", + j5c57 => "--:-:3:-:1 \@P0 $convert storeI0, loadI0.H0;\n", + + j5c59 => "20:-:-:-:1 \@P0 STS.128 [writeIs + 4x<64>], storeI4;\n", + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeIs + 4x<00>], storeI0;\n", + + j6c55 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j6c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j6c62 => "04:-:3:-:1 \@P1 LDG.E.128 loadI, [trackI];\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 3 + (tid & 64) +// tidOY = (tid & 63) >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 3; +--:-:-:-:1 LOP.AND tidOX2, tid, 64; +--:-:-:-:1 LOP.OR tidOX, tidOX, tidOX2; +--:-:-:-:1 LOP.AND tidOY, tid, 63; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Expand back out to undo our bank conflict avoiding stride +--:-:-:-:1 SHL readIs, readIs, 1; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*64 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 6; + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X64_N64.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X64_N64.sass new file mode 100644 index 0000000..16b92c5 --- /dev/null +++ b/Kernel/Convolution/Maxwell/hconv_xprop_X64_N64.sass @@ -0,0 +1,290 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 64; + our $shareF = 64; + our $stepI = 32; + our $stepF = 32; + our $remapF = 1; + our $remapI = 1; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} + +-] + + + + + addr_zero : 4x<64*8*2 + 64*8*2 + 0> + szShareF : (64*8) + szShareI : (64*8) + + addr_zero : 4x<64*8*2 + 64*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 64*8*2 + 4> + addr_m : 4x<64*8*2 + 64*8*2 + 4> + addr_p : 4x<64*8*2 + 64*8*2 + 5> + addr_q : 4x<64*8*2 + 64*8*2 + 6> + addr_k : 4x<64*8*2 + 64*8*2 + 7> + addr_szLut : 4x<64*8*2 + 64*8*2 + 8> + addr_lut : 4x<64*8*2 + 64*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-111 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-103 : 
loadI<0-3> + 100-103 : storeI<0-3> + 104-107 : storeI<4-7> + + 108-111 : loadF<0-3> + 108-111 : storeF<0-3> + 104-107 : storeF<4-7> + + 104-107 ~ offsetF + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 114-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc + 126-127 ~ readFs, readIs + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-125 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] + +[+ get_mpqk() +] + +// tidX = (tid & 7) << 3 +// tidY = tid >> 3 +--:-:-:-:1 LOP.AND tidX, tid, 7; +--:-:-:-:1 SHL tidX, tidX, 3; +--:-:-:-:1 SHR.U32 tidY, tid, 3; + +// trackF += blkF*64 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 6; + +// trackI += blkI*64 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 6; + +// Remap the X dim to avoid bank conflicts when storing to shared +// We can unmap this in the output +--:-:-:-:1 SHR.U32 tidX, tidX, 1; + +// writeS = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 6; +--:-:-:-:1 SHL writeS, writeS, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:0 SHL readFs, readFs, 4; + +// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:1:-:1 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:5:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; + +--:-:2:-:1 @P1 LDG.E.128 loadI0, [trackI]; +--:-:6:-:1 @!P1 LDS.U.128 loadI0, [addr_zero]; + 
+11:-:-:-:1 [+ convert() +] storeF7, loadF3.H1; +--:-:-:-:1 [+ convert() +] storeF6, loadF3.H0; +--:-:-:-:1 [+ convert() +] storeF5, loadF2.H1; +--:-:1:-:1 [+ convert() +] storeF4, loadF2.H0; +--:-:-:-:1 [+ convert() +] storeF3, loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; +--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:5:-:1 [+ convert() +] storeF0, loadF0.H0; + +01:1:-:-:1 STS.128 [writeS + 4x<0*64 + 32>], storeF4; +10:-:-:-:1 STS.128 [writeS + 4x<0*64 + 0>], storeF0; + +23:-:-:-:1 [+ convert() +] storeI7, loadI3.H1; +--:-:-:-:1 [+ convert() +] storeI6, loadI3.H0; +--:-:-:-:1 [+ convert() +] storeI5, loadI2.H1; +--:-:1:-:1 [+ convert() +] storeI4, loadI2.H0; +--:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:5:-:1 [+ convert() +] storeI0, loadI0.H0; + +01:-:-:-:1 STS.128 [writeS + 4x<8*64 + 32>], storeI4; +10:1:-:-:1 STS.128 [writeS + 4x<8*64 + 0>], storeI0; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>]; +--:-:3:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x< 0>]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c20 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j1c25 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j1c31 => "--:-:-:-:1 \@P1 XMAD.PSL 
offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j1c32 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j1c18 => "02:-:-:-:1 \@P0 $convert storeF7, loadF3.H1;\n", + j1c22 => "--:-:-:-:1 \@P0 $convert storeF6, loadF3.H0;\n", + j1c26 => "--:-:-:-:1 \@P0 $convert storeF5, loadF2.H1;\n", + j1c30 => "--:-:5:-:1 \@P0 $convert storeF4, loadF2.H0;\n", + j1c33 => "--:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c41 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c45 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c47 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<0*64 + 32>], storeF4;\n", + j1c62 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*64 + 0>], storeF0;\n", + + j2c19 => "30:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c24 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c26 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c28 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c30 => "02:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF];\n", + + j5c29 => "04:-:-:-:1 \@P0 $convert storeI7, loadI3.H1;\n", + j5c33 => "--:-:-:-:1 \@P0 $convert storeI6, loadI3.H0;\n", + j5c37 => "--:-:-:-:1 \@P0 $convert storeI5, loadI2.H1;\n", + j5c41 => "--:-:5:-:1 \@P0 $convert storeI4, loadI2.H0;\n", + j5c45 => "--:-:-:-:1 \@P0 $convert storeI3, loadI1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert storeI2, loadI1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert storeI1, loadI0.H1;\n", + j5c57 => "--:-:3:-:1 \@P0 $convert storeI0, loadI0.H0;\n", + + j5c59 => "10:-:-:-:1 \@P0 STS.128 [writeS + 4x<8*64 + 32>], storeI4;\n", + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<8*64 + 0>], storeI0;\n", + + j6c50 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j6c55 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j6c61 => "04:-:3:-:1 \@P1 LDG.E.128 loadI0, [trackI];\n", + + j6c62 => "--:-:-:-:5 \@P0 
BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 3 +// tidOY = tid >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 3; +--:-:-:-:1 SHR.U32 tidOY, tid, 3; + +--:-:-:-:1 LOP.AND readIs, readIs, 0x7ff; +--:-:-:-:1 LOP.AND readFs, readFs, 0x7ff; + +// Expand back out to undo our bank conflict avoiding stride +--:-:-:-:1 SHL readIs, readIs, 1; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 64 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 6; + +// Mul by 4 here expands k stride back out +// Mul by 2 again to undo the bank conflict avoiding stride +// k = blkF*64 + tidOY * 8 +--:-:-:-:1 SHL tidOY, tidOY, 3; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 6; + +[+ output_setup(63, 0, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Maxwell/persistent_rnn_bprop.sass b/Kernel/Convolution/Maxwell/persistent_rnn_bprop.sass new file mode 100644 index 0000000..ddddb22 --- /dev/null +++ b/Kernel/Convolution/Maxwell/persistent_rnn_bprop.sass @@ -0,0 +1,638 @@ +# Kernel: presistent_birnn + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(64*48)> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_d[0] : c[0x0][0x140] + param_d[1] : c[0x0][0x144] + param_dnext[0] : c[0x0][0x148] + param_dnext[1] : c[0x0][0x14c] + param_h[0] : c[0x0][0x150] + param_h[1] : c[0x0][0x154] + param_w[0] : c[0x0][0x158] + param_w[1] : c[0x0][0x15c] + param_lockAddr[0] : c[0x0][0x160] + param_lockAddr[1] : c[0x0][0x164] + param_ldd : c[0x0][0x168] + param_ldh : c[0x0][0x16c] + param_ldw : c[0x0][0x170] + param_bsz : c[0x0][0x174] + param_seqLength : c[0x0][0x178] + param_numBlks : c[0x0][0x17c] + param_rowSize : c[0x0][0x180] + param_reverse : c[0x0][0x184] + param_reluclip : c[0x0][0x188] + + + + + 0-215 : weight<000-215> + 216-227 : accum<00-11> + 228-231 : timeStep, warpTid, rowOffset, tid + + 232-235 : wAddr<0-1>, biasAddr<0-1> + 236-254 ~ bid, ldw, wRow, loadRow, tidLsbs, tidMsbs, warpIndex, storeWeights, loadWeights, outRow, rowSize + + 232-249 : loadBuffer<0-3>, delta0r<0-3>, delta1r<0-3>, delta2r<0-3>, dnextAddr<0-1> + 250-254 ~ loadDeltas, storeDeltas, loadIndex, dOffset, ldd + + 236-247 : peerR0V<0-3>, peerR1V<0-3>, peerR2V<0-3> + 244 : hOffset + 248-253 : h<0-3>, hAddr<0-1> + + 232-241 : output<0-3>, dAddr<0-1>, lockAddr<0-1>, expectVal, setVal + 241-245 ~ storeIndex, hRow, predSave, lockVal, reluclip + + + +//Get tid/block id +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R bid, SR_CTAID.X; + +//Store zeros at addr_zero +--:-:-:-:1 STS.128 [addr_zero], RZ; + + +--:-:-:-:1 MOV ldw, param_ldw; +--:-:-:-:1 MOV rowSize, param_rowSize; + +//timeStep = (param_reverse == 0) ? 
0 : param_seqLength +--:-:-:-:1 ISETP.EQ.AND P2, PT, RZ, param_reverse, PT; +--:-:-:-:1 SEL timeStep, RZ, param_seqLength, P2; +--:-:-:-:1 @!P2 IADD timeStep, timeStep, -1; + +//warpIndex = threadIdx.x >> 5 +01:-:-:-:1 SHR.U32 warpIndex, tid, 5; + +//warpTid = threadIdx.x & 0x1f +01:-:-:-:1 LOP.AND warpTid, tid, 0x1f; + +//rowOffset = ((blockIdx.x << 3) + warp_index) * 6 +02:-:-:-:1 SHL rowOffset, bid, 3; +--:-:-:-:1 IADD rowOffset, rowOffset, warpIndex; +--:-:-:-:1 XMAD rowOffset, rowOffset, 6, RZ; + +//if(warp_tid > 15) rowOffset += 3 +--:-:-:-:1 ISETP.GT.AND P1, PT, warpTid, 15, PT; +--:-:-:-:1 @P1 IADD rowOffset, rowOffset, 3; + +//warpTid = warpTid & 0x0f +--:-:-:-:1 LOP.AND warpTid, warpTid, 0x0f; +--:-:-:-:1 ISETP.LT.AND P0, PT, warpTid, 3, PT; +--:-:-:-:1 IADD outRow, rowOffset, warpTid; +--:-:-:-:1 ISETP.LT.AND P0, PT, outRow, param_rowSize, P0; + +//storeWeights = (((tid >> 2) * 48) + ((tid & 3) << 2)) << 2 +//wRow = ((tid >> 2) * ldw) + ((tid & 3) << 2) + (bid * 48) +--:-:-:-:1 LOP.AND tidLsbs, warpTid, 0x03; +--:-:-:-:1 SHR tidMsbs, tid, 2; +--:-:-:-:1 SHL tidLsbs, tidLsbs, 2; + +--:-:-:-:1 XMAD loadRow, bid, 48, tidLsbs; +--:-:-:-:1 XMAD wRow, tidMsbs, ldw, loadRow; + +--:-:-:-:1 XMAD storeWeights, tidMsbs, 48, tidLsbs; +--:-:-:-:1 SHL storeWeights, storeWeights, 2; + +//loadWeights = (((warpTid * 8) + warpIndex) * 6) + (P1 ? 
3 : 0)) << 2 +--:-:-:-:1 XMAD loadWeights, warpTid, 8, warpIndex; +--:-:-:-:1 XMAD loadWeights, loadWeights, 6, RZ; +--:-:-:-:1 @P1 IADD loadWeights, loadWeights, 3; +--:-:-:-:1 SHL loadWeights, loadWeights, 2; + +//wAddr = &w[wRow] +--:-:-:-:1 LEA wAddr0.CC, wRow, param_w[0], 2; +--:-:-:-:1 LEA.HI.X wAddr1, wRow, param_w[1], RZ, 2; + +//ldw = ldw << 8 (byte stride of 64 rows of 4-byte weights) +--:-:-:-:1 SHL ldw, ldw, 8; + +//Compute row loading predicates +--:-:-:-:1 ISETP.LT.AND P1, PT, tidMsbs, rowSize, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, loadRow, rowSize, P1; +--:-:-:-:1 IADD rowSize, rowSize, -16; +--:-:-:-:1 ISETP.LT.AND P4, PT, loadRow, rowSize, P1; +--:-:-:-:1 IADD rowSize, rowSize, -16; +--:-:-:-:1 ISETP.LT.AND P5, PT, loadRow, rowSize, P1; + + +--:-:-:Y:c NOP; + +//Load weights to registers + + my $out; + my $regId = 0; + my $rowsize = 1152; + + for (my $col=0; $col < $rowsize; $col += 64) + { + $out .= "--:-:-:-:1 IADD tidMsbs, tidMsbs, 64;\n"; + + #Use vector loads from weight matrix + $regId = $col / 16; + $out .= sprintf "--:-:1:-:1 \@P3 LDG.E.128 weight%03d, [wAddr];\n", $regId; + $out .= sprintf "--:-:1:-:1 \@!P3 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + $regId = $col / 16 + 72; + $out .= sprintf "--:-:2:-:1 \@P4 LDG.E.128 weight%03d, [wAddr + 4x<16>];\n", $regId; + $out .= sprintf "--:-:2:-:1 \@!P4 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + $regId = $col / 16 + 144; + $out .= sprintf "--:-:3:-:1 \@P5 LDG.E.128 weight%03d, [wAddr + 4x<32>];\n", $regId; + $out .= sprintf "--:-:3:-:1 \@!P5 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + + $out .= "--:-:-:-:1 ISETP.LT.AND P3, PT, tidMsbs, param_rowSize, P3;\n"; + $out .= "--:-:-:-:1 ISETP.LT.AND P4, PT, tidMsbs, param_rowSize, P4;\n"; + $out .= "--:-:-:-:1 ISETP.LT.AND P5, PT, tidMsbs, param_rowSize, P5;\n"; + + #Store weights into shared memory + if ($col > 0) + { + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + } + + $regId = $col / 16; + $out .= sprintf "01:-:-:-:1 STS.U.128 [storeWeights], weight%03d;\n", $regId; + $regId = $col / 
16 + 72; + $out .= sprintf "02:-:-:-:1 STS.U.128 [storeWeights + 4x<16>], weight%03d;\n", $regId; + $regId = $col / 16 + 144; + $out .= sprintf "04:-:-:-:1 STS.U.128 [storeWeights + 4x<32>], weight%03d;\n", $regId; + + $out .= "--:-:-:-:6 IADD wAddr0.CC, wAddr0, ldw;\n"; + $out .= "--:-:-:-:1 IADD.X wAddr1, wAddr1, RZ;\n\n"; + + #Load each weight from shared mem + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + + foreach my $row (0 .. 2) + { + foreach my $shared_col (0 .. 3) + { + my $control; + + if (($col + 64) >= $rowsize && $row == 2 && $shared_col == 3) + { + $control = "--:1:6:-:2"; + } + else + { + $control = "--:-:-:-:1"; + } + + $regId = ($row * 72) + ($col / 16) + $shared_col; + my $shared_offset = $row + ($shared_col * 16 * 48); + $out .= sprintf "%s LDS.U weight%03d, [loadWeights + 4x<%d>];\n", $control, $regId, $shared_offset; + } + } + } + + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + + return $out; + + + +//Predicates for store code +--:-:-:-:1 ISETP.EQ.AND P2, PT, warpTid, 0, PT; +--:-:-:-:1 ISETP.EQ.AND P3, PT, warpTid, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P4, PT, warpTid, 2, PT; + +UNROLLING_LOOP: + +//Prime inner product loop by loading first rows of dnext +--:-:-:-:1 MOV loadIndex, tid; + +//storeDeltas = tid << 4 +--:-:-:-:1 SHL storeDeltas, tid, 4; +--:-:-:-:1 SHL loadDeltas, warpTid, 4; + +//dnextAddr = &d_next[loadIndex * ldd + timeStep] +--:-:-:-:1 XMAD dOffset, loadIndex, param_ldd, timeStep; +--:-:-:-:1 LEA dnextAddr0.CC, dOffset, param_dnext[0], 4; +01:-:-:-:2 LEA.HI.X dnextAddr1, dOffset, param_dnext[1], RZ, 4; + +//loadBuffer = (loadIndex < rowSize) ? *dnextAddr : zeros read back from shared addr_zero +--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT; +--:5:1:-:2 @P1 LDG.E.CI.128 loadBuffer, [dnextAddr]; +--:5:1:-:2 @!P1 LDS.U.128 loadBuffer, [addr_zero]; + +//ldd = param_ldd << 12 +--:-:-:-:1 MOV ldd, param_ldd; +--:-:-:-:1 SHL ldd, ldd, 12; + + +//Initialize all accumulation registers to 0 + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 accum%02d, [addr_zero];\n", $_ * 4), 0..2; + + +//Update 
load index and load address +--:-:-:-:6 IADD loadIndex, loadIndex, 256; +--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT; +10:-:-:-:6 IADD dnextAddr0.CC, dnextAddr0, ldd; +--:-:-:-:6 IADD.X dnextAddr1, dnextAddr1, RZ; + +01:-:-:-:1 STS.U.128 [storeDeltas], loadBuffer; + +//Unrolled GEMM loop + + our @top; + + my $out = join '', @top; + + my $rowsize = 1152; + my $weight_index = 0; + + my $wait_flag = 2; + my $set_flag = 4; + my $read_buffer = 0; + my $write_buffer = 2; + + for (my $k=0; $k < $rowsize; $k+=256) + { + if ($k == 0) + { + $out .= "--:6:1:-:1 \@P1 LDG.E.CI.128 loadBuffer, [dnextAddr];\n"; + $out .= "--:-:1:-:1 \@!P1 LDS.U.128 loadBuffer, [addr_zero];\n\n"; + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + $out .= "--:-:2:-:1 LDS.U.128 delta0r, [loadDeltas];\n"; + $out .= "--:-:3:-:1 LDS.U.128 delta1r, [loadDeltas + 4x<4*16>];\n\n"; + } + $out .= "--:-:-:-:1 LOP.XOR storeDeltas, storeDeltas, 4096;\n"; + + foreach my $shared_row (0 .. 15) + { + if($weight_index < 72) + { + if ($shared_row < 14 && ($k + (16 * ($shared_row + 2))) < $rowsize) + { + my $read_bar = "-"; + if ($shared_row == 13 && ($k + 256) < $rowsize) + { + $read_bar = "5"; + } + $out .= sprintf "--:%s:%d:-:1 LDS.U.128 delta%dr, [loadDeltas + 4x<4*%d>];\n", $read_bar, $set_flag, $write_buffer, (16 * ($shared_row + 2)); + } + + if ($shared_row == 11 && ($k + 512) < $rowsize) + { + $out .= "--:-:-:-:1 IADD loadIndex, loadIndex, 256;\n"; + $out .= "20:-:-:-:1 IADD dnextAddr0.CC, dnextAddr0, ldd;\n"; + } + + if ($shared_row == 12 && ($k + 512) < $rowsize) + { + $out .= "--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;\n"; + $out .= "--:-:-:-:1 IADD.X dnextAddr1, dnextAddr1, RZ;\n"; + } + + if ($shared_row == 13) + { + $out .= "01:-:-:-:1 STS.U.128 [storeDeltas], loadBuffer;\n"; + + if(($k + 512) < $rowsize) + { + $out .= "--:6:1:-:1 \@P1 LDG.E.CI.128 loadBuffer, [dnextAddr];\n"; + $out .= "--:-:1:-:1 \@!P1 LDS.U.128 loadBuffer, [addr_zero];\n\n"; + } + else + { + $out .= 
"--:-:-:-:6 IADD dOffset, rowOffset, warpTid;\n"; + $out .= "--:-:-:-:6 XMAD dOffset, dOffset, param_ldd, timeStep;\n"; + $out .= "--:-:-:-:6 LEA dnextAddr0.CC, dOffset, param_d[0], 4;\n"; + $out .= "--:-:-:-:2 LEA.HI.X dnextAddr1, dOffset, param_d[1], RZ, 4;\n"; + $out .= "--:-:6:-:1 \@P0 LDG.E.CI.128 loadBuffer, [dnextAddr];\n\n"; + } + } + + if ($shared_row == 14 && ($k + 256) < $rowsize) + { + $out .= "10:-:-:-:1 LOP.XOR loadDeltas, loadDeltas, 4096;\n"; + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + $out .= sprintf "--:-:%d:-:1 LDS.U.128 delta%dr, [loadDeltas];\n", $set_flag, $write_buffer; + } + + if ($shared_row == 15 && ($k + 256) < $rowsize) + { + $out .= sprintf "--:-:%d:-:1 LDS.U.128 delta%dr, [loadDeltas + 4x<4*16>];\n\n", $set_flag, $write_buffer; + } + + foreach my $row (0 .. 2) + { + my $weight = ($row * 72) + $weight_index; + + foreach my $col (0 .. 3) + { + my $accum = ($row * 4) + $col; + my $wait = "--"; + my $stall = 1; + if ($accum == 0) + { + if ($weight_index == 0) + { + $wait = sprintf "%02x", (0x20 | (1 << ($wait_flag - 1))); + } + else + { + $wait = sprintf "%02x", (1 << ($wait_flag - 1)); + } + } + + if ($row == 2 && $col == 3) + { + if ($shared_row < 13 && ($k + (16 * ($shared_row + 3))) < $rowsize) + { + $stall = 0; + } + elsif ($shared_row == 14 && ($k + 256) < $rowsize) + { + $stall = 0; + } + } + + $out .= sprintf "%s:-:-:-:%d FFMA accum%02d, weight%03d, delta%dr%d, accum%02d;\n", $wait, $stall, $accum, $weight, $read_buffer, $col, $accum; + } + } + + $weight_index++; + } + + $wait_flag += 1; + $set_flag += 1; + $read_buffer += 1; + $write_buffer += 1; + if($wait_flag == 5) + { + $wait_flag = 2; + } + if($set_flag == 5) + { + $set_flag = 2; + } + if($read_buffer == 3) + { + $read_buffer = 0; + } + if($write_buffer == 3) + { + $write_buffer = 0; + } + } + } + + return $out; + + +//Load hidden states +--:-:-:-:6 IADD hOffset, rowOffset, warpTid; +--:-:-:-:6 XMAD hOffset, hOffset, param_ldh, timeStep; +--:-:-:-:6 LEA hAddr0.CC, hOffset, 
param_h[0], 4; +--:-:-:-:2 LEA.HI.X hAddr1, hOffset, param_h[1], RZ, 4; +--:-:5:-:1 @P0 LDG.E.CI.128 h, [hAddr]; + +//Reduction between threads +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 1, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 1, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 1, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 1, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 1, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 2, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 2, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 2, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 2, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 2, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 2, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 2, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 2, 
0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 4, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 4, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 4, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 4, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 4, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 4, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 4, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 4, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 8, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 8, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 8, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 8, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; 
+--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 8, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 8, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 8, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 8, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:5 MOV reluclip, param_reluclip; + +//Compute store pointer + +--:-:-:-:1 IADD hRow, rowOffset, warpTid; +--:-:-:-:1 XMAD storeIndex, hRow, param_ldd, timeStep; +--:-:-:-:1 LEA dAddr0.CC, storeIndex, param_d[0], 4; +--:-:-:-:1 LEA.HI.X dAddr1, storeIndex, param_d[1], RZ, 4; +--:-:-:-:1 LEA lockAddr0, timeStep, param_lockAddr[0], 2; +--:-:-:-:1 LEA.HI.X lockAddr1, timeStep, param_lockAddr[1], RZ, 2; + +//Conditional select for output +//TODO: make sure scheduler orders these such that first one waits on barrier +20:-:-:-:1 @P2 FADD output0, output0, accum00; +20:-:-:-:1 @P3 FADD output0, output0, accum04; +20:-:-:-:1 @P4 FADD output0, output0, accum08; + +20:-:-:-:1 @P2 FADD output1, output1, accum01; +20:-:-:-:1 @P3 FADD output1, output1, accum05; +20:-:-:-:1 @P4 FADD output1, output1, accum09; + +20:-:-:-:1 @P2 FADD output2, output2, accum02; +20:-:-:-:1 @P3 FADD output2, output2, accum06; +20:-:-:-:1 @P4 FADD output2, output2, accum10; + +20:-:-:-:1 @P2 FADD output3, output3, accum03; +20:-:-:-:1 @P3 FADD 
output3, output3, accum07; +20:-:-:-:3 @P4 FADD output3, output3, accum11; + + +//Save select predicates +//TODO: how many stall cycles needed here? +--:-:-:-:6 P2R predSave, PR, RZ, 0x1e; + +//Multiply by the bprop derivative of the clipped ReLU (reluclip) activation: keep output only where 0 < h < reluclip +//TODO: others + +10:-:-:-:1 FSETP.LT.AND P2, PT, RZ, h0, PT; +10:-:-:-:1 FSETP.LT.AND P3, PT, RZ, h1, PT; +10:-:-:-:1 FSETP.LT.AND P4, PT, RZ, h2, PT; +10:-:-:-:1 FSETP.LT.AND P5, PT, RZ, h3, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, h0, reluclip, P2; +--:-:-:-:1 FSETP.LT.AND P3, PT, h1, reluclip, P3; +--:-:-:-:1 FSETP.LT.AND P4, PT, h2, reluclip, P4; +--:-:-:-:1 FSETP.LT.AND P5, PT, h3, reluclip, P5; +--:-:-:-:1 @!P2 FMUL output0, output0, RZ; +--:-:-:-:1 @!P3 FMUL output1, output1, RZ; +--:-:-:-:1 @!P4 FMUL output2, output2, RZ; +--:-:-:-:1 @!P5 FMUL output3, output3, RZ; + +//Update timestep +--:-:-:-:1 ISETP.EQ.AND P1, PT, RZ, param_reverse, PT; +--:-:-:-:1 @P1 MOV setVal, 1; +--:-:-:-:1 @!P1 MOV setVal, -1; +--:-:-:-:1 @P1 MOV expectVal, param_seqLength; +--:-:-:-:1 @!P1 MOV expectVal, -1; +--:-:-:-:1 IADD timeStep, timeStep, setVal; + + +//Conditional store +--:-:-:-:5 @P0 STG.E.CI.128 [dAddr], output; + +//Compute predicate for time unrolling loop +--:-:-:Y:d ISETP.NE.AND P5, PT, timeStep, expectVal, PT; + +//P2 = (tid != 0) +//setVal = 1 +--:-:-:-:1 ISETP.NE.AND P2, PT, tid, RZ, PT; +--:-:-:-:1 MOV expectVal, param_numBlks; +--:-:-:Y:b MOV setVal, 1; + +//Barrier for all blocks +--:-:-:-:f MEMBAR.GL; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:2 SSY SSY_TARGET1; +--:-:-:-:d @P2 SYNC; + +--:-:-:Y:2 ATOM.E.ADD RZ, [lockAddr], setVal; +--:-:-:-:d SYNC; + +SSY_TARGET1: +--:-:-:-:1 SSY SSY_TARGET2; +--:-:-:-:d @P2 SYNC; + +SPINLOCK: +--:-:1:Y:2 LDG.E lockVal, [lockAddr]; +01:-:-:Y:d ISETP.NE.AND P2, PT, lockVal, expectVal, PT; +--:-:-:-:5 @P2 BRA.U SPINLOCK; +--:-:-:-:d SYNC; + +SSY_TARGET2: +--:-:-:-:5 BAR.SYNC 0; + +//Restore select predicates +--:-:-:-:1 R2P PR, predSave, 0x1e; + +//Conditional branch back to beginning of loop 
+--:-:-:Y:5 @P5 BRA.U UNROLLING_LOOP; + +--:-:-:-:5 EXIT; diff --git a/Kernel/Convolution/Maxwell/persistent_rnn_fprop.sass b/Kernel/Convolution/Maxwell/persistent_rnn_fprop.sass new file mode 100644 index 0000000..6a11539 --- /dev/null +++ b/Kernel/Convolution/Maxwell/persistent_rnn_fprop.sass @@ -0,0 +1,653 @@ +# Kernel: presistent_birnn + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(64*48)> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_h[0] : c[0x0][0x140] + param_h[1] : c[0x0][0x144] + param_hprev[0] : c[0x0][0x148] + param_hprev[1] : c[0x0][0x14c] + param_bias[0] : c[0x0][0x150] + param_bias[1] : c[0x0][0x154] + param_w[0] : c[0x0][0x158] + param_w[1] : c[0x0][0x15c] + param_lockAddr[0] : c[0x0][0x160] + param_lockAddr[1] : c[0x0][0x164] + param_ldh : c[0x0][0x168] + param_ldw : c[0x0][0x16c] + param_bsz : c[0x0][0x170] + param_seqLength : c[0x0][0x174] + param_numBlks : c[0x0][0x178] + param_rowSize : c[0x0][0x17c] + param_reverse : c[0x0][0x180] + param_reluclip : c[0x0][0x184] + + + + + 0-215 : weight<000-215> + 216-227 : accum<00-11> + 228-229 : timeStep, biasValue + 230-232 : warpTid, rowOffset, tid + + 233 : bid + + 236-243 : wAddr0r<0-1>, wAddr1r<0-1>, wAddr2r<0-1>, biasAddr<0-1> + 244-254 ~ ldw, wRow, warpTid4, loadRow, warpIndex, storeWeights, loadWeights, rowSize + + 233 : hOffset + 233 : ldh + 234-239 : hprevAddr<0-1>, loadBuffer<0-3> + 240-251 : 
hidden0r<0-3>, hidden1r<0-3>, hidden2r<0-3> + 252-254 ~ loadHiddens, storeHiddens, loadIndex + + 240-251 : peerR0V<0-3>, peerR1V<0-3>, peerR2V<0-3> + + 240-249 : output<0-3>, hAddr<0-1>, lockAddr<0-1>, expectVal, setVal + 250-254 ~ storeIndex, hRow, predSave, lockVal, reluclip + + + +//Get tid/block id +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R bid, SR_CTAID.X; + +//Store zeros at addr_zero +--:-:-:-:1 STS.128 [addr_zero], RZ; + + +--:-:-:-:1 MOV ldw, param_ldw; +--:-:-:-:1 MOV rowSize, param_rowSize; + +//timeStep = (param_reverse == 0) ? 0 : param_seqLength +--:-:-:-:1 ISETP.EQ.AND P2, PT, RZ, param_reverse, PT; +--:-:-:-:1 SEL timeStep, RZ, param_seqLength, P2; +--:-:-:-:1 @!P2 IADD timeStep, timeStep, -1; + +//warpIndex = threadIdx.x >> 5 +01:-:-:-:1 SHR.U32 warpIndex, tid, 5; + +//warpTid = threadIdx.x & 0x1f +01:-:-:-:1 LOP.AND warpTid, tid, 0x1f; + +//rowOffset = ((blockIdx.x << 3) + warp_index) * 6 +02:-:-:-:1 SHL rowOffset, bid, 3; +--:-:-:-:1 IADD rowOffset, rowOffset, warpIndex; +--:-:-:-:1 XMAD rowOffset, rowOffset, 6, RZ; + +//if(warp_tid > 15) rowOffset += 3 +--:-:-:-:1 ISETP.GT.AND P1, PT, warpTid, 15, PT; +--:-:-:-:1 @P1 IADD rowOffset, rowOffset, 3; + +//warpTid = warpTid & 0x0f +--:-:-:-:1 LOP.AND warpTid, warpTid, 0x0f; +--:-:-:-:1 ISETP.LT.AND P0, PT, warpTid, 3, PT; + +//warpTid4 = warpTid << 2 +--:-:-:-:1 SHL warpTid4, warpTid, 2; + +//storeWeights = ((P1) ? (warpTid4 + 3*64) : warpTid4) << 2 +//loadWeights = ((P1) ? 
(warpTid + 3*64) : warpTid) << 2 +--:-:-:-:1 @P1 MOV loadWeights, 3; +--:-:-:-:1 @!P1 MOV loadWeights, RZ; + +--:-:-:-:1 XMAD loadWeights, warpIndex, 6, loadWeights; +--:-:-:-:1 SHL loadWeights, loadWeights, 6; + +--:-:-:-:1 IADD storeWeights, loadWeights, warpTid4; +--:-:-:-:1 IADD loadWeights, loadWeights, warpTid; +--:-:-:-:1 SHL storeWeights, storeWeights, 2; +--:-:-:-:1 SHL loadWeights, loadWeights, 2; + +//wRow = rowOffset * ldw + warpTid4 +--:-:-:-:1 XMAD wRow, rowOffset, ldw, warpTid4; + +//wAddr0r = &w[wRow] +--:-:-:-:1 LEA wAddr0r0.CC, wRow, param_w[0], 2; +--:-:-:-:1 LEA.HI.X wAddr0r1, wRow, param_w[1], RZ, 2; + +//ldw = ldw << 2 +--:-:-:-:1 SHL ldw, ldw, 2; + +//wAddr1r = wAddr0r + ldw +--:-:-:-:1 IADD wAddr1r0.CC, wAddr0r0, ldw; +--:-:-:-:1 IADD.X wAddr1r1, wAddr0r1, RZ; + +//wAddr2r = wAddr1r + ldw +--:-:-:-:1 IADD wAddr2r0.CC, wAddr1r0, ldw; +--:-:-:-:1 IADD.X wAddr2r1, wAddr1r1, RZ; + +//Compute row loading predicates +--:-:-:-:1 ISETP.LT.AND P1, PT, warpTid4, rowSize, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, rowOffset, rowSize, P1; +--:-:-:-:1 IADD rowSize, rowSize, -1; +--:-:-:-:1 ISETP.LT.AND P4, PT, rowOffset, rowSize, P1; +--:-:-:-:1 IADD rowSize, rowSize, -1; +--:-:-:-:1 ISETP.LT.AND P5, PT, rowOffset, rowSize, P1; + + +--:-:-:Y:c NOP; + +//Load weights to registers + + my $out; + my $regId = 0; + my $rowsize = 1152; + + for (my $col=0; $col < $rowsize; $col += 64) + { + $out .= "--:-:-:-:1 IADD warpTid4, warpTid4, 64;\n"; + + #Use vector loads from weight matrix + $regId = $col / 16; + $out .= sprintf "--:-:1:-:1 \@P3 LDG.E.128 weight%03d, [wAddr0r + 4x<%d>];\n", $regId, $col; + $out .= sprintf "--:-:1:-:1 \@!P3 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + $regId = $col / 16 + 72; + $out .= sprintf "--:-:2:-:1 \@P4 LDG.E.128 weight%03d, [wAddr1r + 4x<%d>];\n", $regId, $col; + $out .= sprintf "--:-:2:-:1 \@!P4 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + $regId = $col / 16 + 144; + $out .= sprintf "--:-:3:-:1 \@P5 LDG.E.128 weight%03d, 
[wAddr2r + 4x<%d>];\n", $regId, $col; + $out .= sprintf "--:-:3:-:1 \@!P5 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + + $out .= "--:-:-:-:1 ISETP.LT.AND P3, PT, warpTid4, rowSize, P3;\n"; + $out .= "--:-:-:-:1 ISETP.LT.AND P4, PT, warpTid4, rowSize, P4;\n"; + $out .= "--:-:-:-:1 ISETP.LT.AND P5, PT, warpTid4, rowSize, P5;\n"; + + #Store weights into shared memory + if ($col > 0) + { + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + } + + $regId = $col / 16; + $out .= sprintf "01:-:-:-:1 STS.U.128 [storeWeights], weight%03d;\n", $regId; + $regId = $col / 16 + 72; + $out .= sprintf "02:-:-:-:1 STS.U.128 [storeWeights + 4x<64>], weight%03d;\n", $regId; + $regId = $col / 16 + 144; + $out .= sprintf "04:-:-:-:1 STS.U.128 [storeWeights + 4x<128>], weight%03d;\n", $regId; + + #Load each weight from shared mem + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + + foreach my $shared_col (0 .. 3) + { + foreach my $row (0 .. 2) + { + my $control; + + if (($col + 64) >= $rowsize && $row == 2 && $shared_col == 3) + { + $control = "--:1:6:-:2"; + } + else + { + $control = "--:-:-:-:1"; + } + + $regId = ($row * 72) + ($col / 16) + $shared_col; + my $shared_offset = ($row * 64) + ($shared_col * 16); + $out .= sprintf "%s LDS.U weight%03d, [loadWeights + 4x<%d>];\n", $control, $regId, $shared_offset; + } + } + } + + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + + return $out; + + + +//Conditional load of bias + +01:-:-:-:1 IADD loadRow, rowOffset, warpTid; +--:-:-:-:1 ISETP.LT.AND P0, PT, loadRow, param_rowSize, P0; +--:-:-:-:1 LEA biasAddr0.CC, loadRow, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X biasAddr1, loadRow, param_bias[1], RZ, 2; +--:-:-:-:1 @P0 LDG.E biasValue, [biasAddr]; +--:-:-:-:1 @!P0 MOV biasValue, RZ; + + +//Predicates for store code +--:-:-:-:1 ISETP.EQ.AND P2, PT, warpTid, 0, PT; +--:-:-:-:1 ISETP.EQ.AND P3, PT, warpTid, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P4, PT, warpTid, 2, PT; + +UNROLLING_LOOP: + +//Prime inner product loop by loading first rows of hprev +--:-:-:-:1 MOV loadIndex, tid; 
+
+//storeHiddens = tid << 4
+--:-:-:-:1 SHL storeHiddens, tid, 4;
+--:-:-:-:1 SHL loadHiddens, warpTid, 4;
+
+//hprevAddr = &h_prev[timeStep * ldh + loadIndex]
+--:-:-:-:1 XMAD hOffset, loadIndex, param_ldh, timeStep;
+--:-:-:-:1 LEA hprevAddr0.CC, hOffset, param_hprev[0], 4;
+--:-:-:-:2 LEA.HI.X hprevAddr1, hOffset, param_hprev[1], RZ, 4;
+
+//loadBuffer = *hprevAddr
+--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+--:5:1:-:2 @P1 LDG.E.CI.128 loadBuffer, [hprevAddr];
+--:5:1:-:2 @!P1 LDS.U.128 loadBuffer, [addr_zero];
+
+//ldh = param_ldh << 12
+--:-:-:-:1 MOV ldh, param_ldh;
+--:-:-:-:1 SHL ldh, ldh, 12;
+
+
+//Initialize all accumulation registers to 0
+
+    return join '', map sprintf("--:-:-:-:1 LDS.U.128 accum%02d, [addr_zero];\n", $_ * 4), 0..2; # emits three 128-bit loads from addr_zero into accum00, accum04 and accum08
+
+
+//Update load index and load address
+--:-:-:-:6 IADD loadIndex, loadIndex, 256;
+--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+10:-:-:-:6 IADD hprevAddr0.CC, hprevAddr0, ldh;
+--:-:-:-:6 IADD.X hprevAddr1, hprevAddr1, RZ;
+
+01:-:-:-:1 STS.U.128 [storeHiddens], loadBuffer;
+
+//Unrolled GEMM loop
+
+    our @top;
+    # Generator for the unrolled inner-product loop: builds and returns the SASS text for every weight step.
+    my $out = join '', @top; # start from whatever text the package-level @top already holds
+
+    my $rowsize = 1152; # hard-coded row size; 1152 / 16 = 72 steps, matching the "< 72" guard below
+    my $weight_index = 0; # running weight step, also selects the weightNNN register (plus 72 per row)
+    # Rotating indices: the wait/set flags cycle through 2..4 and the hidden read/write buffers through 0..2.
+    my $wait_flag = 2;
+    my $set_flag = 4;
+    my $read_buffer = 0;
+    my $write_buffer = 2;
+
+    for (my $k=0; $k < $rowsize; $k+=256) # one pass per 256-wide chunk: k = 0, 256, 512, 768, 1024
+    {
+        if ($k == 0) # first chunk only: issue the next global load, sync, and preload hidden0r / hidden1r
+        {
+            $out .= "--:6:1:-:1 \@P1 LDG.E.CI.128 loadBuffer, [hprevAddr];\n";
+            $out .= "--:-:1:-:1 \@!P1 LDS.U.128 loadBuffer, [addr_zero];\n\n";
+            $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n";
+            $out .= "--:-:2:-:1 LDS.U.128 hidden0r, [loadHiddens];\n";
+            $out .= "--:-:3:-:1 LDS.U.128 hidden1r, [loadHiddens + 4x<4*16>];\n\n";
+        }
+        $out .= "--:-:-:-:1 LOP.XOR storeHiddens, storeHiddens, 4096;\n"; # emitted once per chunk: XORs the store address with 4096
+        # 16 steps per chunk; each step emits its interleaved loads/address updates, then 12 FFMAs (3 rows x 4 cols).
+        foreach my $shared_row (0 .. 15)
+        {
+            if($weight_index < 72) # the last chunk has fewer than 16 real steps; past 72 only the rotation below runs
+            {
+                if ($shared_row < 14 && ($k + (16 * ($shared_row + 2))) < $rowsize) # load the hidden vector used two steps ahead
+                {
+                    my $read_bar = "-";
+                    if ($shared_row == 13 && ($k + 256) < $rowsize) # only the step-13 load of a non-final chunk sets the second control field to 5
+                    {
+                        $read_bar = "5";
+                    }
+                    $out .= sprintf "--:%s:%d:-:1 LDS.U.128 hidden%dr, [loadHiddens + 4x<4*%d>];\n", $read_bar, $set_flag, $write_buffer, (16 * ($shared_row + 2));
+                }
+                # Steps 11 and 12 advance loadIndex / hprevAddr by one chunk and recompute P1.
+                if ($shared_row == 11)
+                {
+                    $out .= "--:-:-:-:1 IADD loadIndex, loadIndex, 256;\n";
+                    $out .= "20:-:-:-:1 IADD hprevAddr0.CC, hprevAddr0, ldh;\n";
+                }
+
+                if ($shared_row == 12)
+                {
+                    $out .= "--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;\n";
+                    $out .= "--:-:-:-:1 IADD.X hprevAddr1, hprevAddr1, RZ;\n";
+                }
+                # Step 13 stores the previously loaded buffer to shared memory and starts the next global load.
+                if ($shared_row == 13)
+                {
+                    $out .= "01:-:-:-:1 STS.U.128 [storeHiddens], loadBuffer;\n";
+                    # While a chunk two ahead still exists keep streaming from hprevAddr; otherwise re-point hprevAddr at param_h and load under P0.
+                    if (($k + 512) < $rowsize)
+                    {
+                        $out .= "--:6:1:-:1 \@P1 LDG.E.CI.128 loadBuffer, [hprevAddr];\n";
+                        $out .= "--:-:1:-:1 \@!P1 LDS.U.128 loadBuffer, [addr_zero];\n\n";
+                    }
+                    else
+                    {
+                        $out .= "--:-:-:-:6 IADD hOffset, rowOffset, warpTid;\n";
+                        $out .= "--:-:-:-:6 XMAD hOffset, hOffset, param_ldh, timeStep;\n";
+                        $out .= "--:-:-:-:6 LEA hprevAddr0.CC, hOffset, param_h[0], 4;\n";
+                        $out .= "--:-:-:-:2 LEA.HI.X hprevAddr1, hOffset, param_h[1], RZ, 4;\n";
+                        $out .= "--:-:6:-:1 \@P0 LDG.E.CI.128 loadBuffer, [hprevAddr];\n\n";
+                    }
+                }
+                # Steps 14 and 15 XOR loadHiddens with 4096, sync, and preload the first two hidden vectors from the new address.
+                if ($shared_row == 14)
+                {
+                    $out .= "10:-:-:-:1 LOP.XOR loadHiddens, loadHiddens, 4096;\n";
+                    $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n";
+                    $out .= sprintf "--:-:%d:-:1 LDS.U.128 hidden%dr, [loadHiddens];\n", $set_flag, $write_buffer;
+                }
+
+                if ($shared_row == 15)
+                {
+                    $out .= sprintf "--:-:%d:-:1 LDS.U.128 hidden%dr, [loadHiddens + 4x<4*16>];\n\n", $set_flag, $write_buffer;
+                }
+                # The 12 FFMAs of this step: accum(row*4+col) += weight(row*72+index) * hidden<read_buffer>r<col>.
+                foreach my $row (0 .. 2)
+                {
+                    my $weight = ($row * 72) + $weight_index;
+
+                    foreach my $col (0 .. 3)
+                    {
+                        my $accum = ($row * 4) + $col;
+                        my $wait = "--";
+                        my $stall = 1;
+                        if ($accum == 0) # only the first FFMA of a step carries a wait mask
+                        {
+                            if ($weight_index == 0) # the very first step additionally ORs in 0x20
+                            {
+                                $wait = sprintf "%02x", (0x20 | (1 << ($wait_flag - 1)));
+                            }
+                            else
+                            {
+                                $wait = sprintf "%02x", (1 << ($wait_flag - 1));
+                            }
+                        }
+                        # The last FFMA of a step gets stall 0 when more emitted text follows it under the conditions below.
+                        if ($row == 2 && $col == 3)
+                        {
+                            if ($shared_row < 13 && ($k + (16 * ($shared_row + 3))) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                            elsif ($shared_row == 14 && ($k + 256) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                        }
+
+                        $out .= sprintf "%s:-:-:-:%d FFMA accum%02d, weight%03d, hidden%dr%d, accum%02d;\n", $wait, $stall, $accum, $weight, $read_buffer, $col, $accum;
+                    }
+                }
+
+                $weight_index++;
+            }
+            # Rotate the flags and buffers after every step, including steps skipped by the "< 72" guard.
+            $wait_flag += 1;
+            $set_flag += 1;
+            $read_buffer += 1;
+            $write_buffer += 1;
+            if($wait_flag == 5)
+            {
+                $wait_flag = 2;
+            }
+            if($set_flag == 5)
+            {
+                $set_flag = 2;
+            }
+            if($read_buffer == 3)
+            {
+                $read_buffer = 0;
+            }
+            if($write_buffer == 3)
+            {
+                $write_buffer = 0;
+            }
+        }
+    }
+
+    return $out; # the complete generated instruction text
+
+
+//Reduction between threads
+--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 1, 0x1f;
+--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 1, 0x1f;
+--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 1, 0x1f;
+
+--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 1, 0x1f;
+--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 1, 0x1f;
+--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 1, 0x1f;
+
+--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 1, 0x1f;
+--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 1, 0x1f;
+--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 1, 0x1f;
+
+--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 1, 0x1f;
+--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 1, 0x1f;
+--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 1, 0x1f;
+
+01:-:-:-:1 FADD accum00, accum00, peerR0V0;
+--:-:-:-:1 FADD accum04, accum04, peerR1V0;
+--:-:-:-:1 FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1 FADD accum01, accum01, peerR0V1;
+--:-:-:-:1 FADD accum05, accum05, peerR1V1;
+--:-:-:-:1 FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 2, 0x1f;
+--:-:-:-:1 SHFL.BFLY 
PT, peerR1V0, accum04, 2, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 2, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 2, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 2, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 2, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 2, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 2, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 2, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 4, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 4, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 4, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 4, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 4, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 4, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 4, 
0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 4, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 8, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 8, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 8, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 8, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 8, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 8, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 8, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 8, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +//Compute store pointer + +--:-:-:-:1 IADD hRow, rowOffset, warpTid; +--:-:-:-:1 XMAD storeIndex, hRow, param_ldh, timeStep; +--:-:-:-:1 LEA hAddr0.CC, storeIndex, param_h[0], 4; 
+--:-:-:-:1 LEA.HI.X hAddr1, storeIndex, param_h[1], RZ, 4; +--:-:-:-:1 LEA lockAddr0, timeStep, param_lockAddr[0], 2; +--:-:-:-:1 LEA.HI.X lockAddr1, timeStep, param_lockAddr[1], RZ, 2; + +//Conditional select for output +--:-:-:-:1 @P2 MOV output0, accum00; +--:-:-:-:1 @P3 MOV output0, accum04; +--:-:-:-:1 @P4 MOV output0, accum08; + +--:-:-:-:1 @P2 MOV output1, accum01; +--:-:-:-:1 @P3 MOV output1, accum05; +--:-:-:-:1 @P4 MOV output1, accum09; + +--:-:-:-:1 @P2 MOV output2, accum02; +--:-:-:-:1 @P3 MOV output2, accum06; +--:-:-:-:1 @P4 MOV output2, accum10; + +--:-:-:-:1 @P2 MOV output3, accum03; +--:-:-:-:1 @P3 MOV output3, accum07; +--:-:-:-:3 @P4 MOV output3, accum11; + +//Update timestep +--:-:-:-:1 ISETP.EQ.AND P5, PT, RZ, param_reverse, PT; +--:-:-:-:1 @P5 MOV setVal, 1; +--:-:-:-:1 @!P5 MOV setVal, -1; +--:-:-:-:1 @P5 MOV expectVal, param_seqLength; +--:-:-:-:1 @!P5 MOV expectVal, -1; +--:-:-:-:1 IADD timeStep, timeStep, setVal; + + +//Save select predicates +--:-:-:-:1 P2R predSave, PR, RZ, 0x0c; + +--:-:-:-:1 MOV reluclip, param_reluclip; + +//Add bias for output +--:-:-:-:1 FADD output0, output0, biasValue; +--:-:-:-:1 FADD output1, output1, biasValue; +--:-:-:-:1 FADD output2, output2, biasValue; +--:-:-:-:3 FADD output3, output3, biasValue; + +//Accumulate on top of current data +20:-:-:-:1 FADD output0, output0, loadBuffer0; +--:-:-:-:1 FADD output1, output1, loadBuffer1; +--:-:-:-:1 FADD output2, output2, loadBuffer2; +--:-:-:-:3 FADD output3, output3, loadBuffer3; + +//Activation function +//TODO: add others +--:-:-:-:2 FMNMX output0, output0, RZ, !PT; +--:-:-:-:2 FMNMX output1, output1, RZ, !PT; +--:-:-:-:2 FMNMX output2, output2, RZ, !PT; +--:-:-:-:2 FMNMX output3, output3, RZ, !PT; + +--:-:-:-:2 FMNMX output0, output0, reluclip, PT; +--:-:-:-:2 FMNMX output1, output1, reluclip, PT; +--:-:-:-:2 FMNMX output2, output2, reluclip, PT; +--:-:-:-:2 FMNMX output3, output3, reluclip, PT; + +//Conditional store +--:-:-:-:1 @P0 STG.E.CI.128 [hAddr], 
output; + +//Compute predicate for time unrolling loop +--:-:-:Y:d ISETP.NE.AND P5, PT, timeStep, expectVal, PT; + +//P2 = (tid != 0) +//setVal = 1 +--:-:-:-:1 ISETP.NE.AND P2, PT, tid, RZ, PT; +--:-:-:-:1 MOV expectVal, param_numBlks; +--:-:-:Y:b MOV setVal, 1; + +//Barrier for all blocks +--:-:-:-:f MEMBAR.GL; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:2 SSY SSY_TARGET1; +--:-:-:-:d @P2 SYNC; + +--:-:-:Y:2 ATOM.E.ADD RZ, [lockAddr], setVal; +--:-:-:-:d SYNC; + +SSY_TARGET1: +--:-:-:-:1 SSY SSY_TARGET2; +--:-:-:-:d @P2 SYNC; + +SPINLOCK: +--:-:1:Y:2 LDG.E lockVal, [lockAddr]; +01:-:-:Y:d ISETP.NE.AND P2, PT, lockVal, expectVal, PT; +--:-:-:-:5 @P2 BRA.U SPINLOCK; +--:-:-:-:d SYNC; + +SSY_TARGET2: +--:-:-:-:5 BAR.SYNC 0; + +//Restore select predicates +--:-:-:-:1 R2P PR, predSave, 0x0c; + +//Conditional branch back to beginning of loop +--:-:-:Y:5 @P5 BRA.U UNROLLING_LOOP; + +--:-:-:-:5 EXIT; diff --git a/Kernel/Convolution/Maxwell/sconv_bprop_C1_N64.sass b/Kernel/Convolution/Maxwell/sconv_bprop_C1_N64.sass new file mode 100644 index 0000000..070db8c --- /dev/null +++ b/Kernel/Convolution/Maxwell/sconv_bprop_C1_N64.sass @@ -0,0 +1,600 @@ +# Kernel: sconv_bprop_C32_N64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_lut : 4x<64*4> + + param_I[0] : c[0x0][0x140] + param_I[1] : c[0x0][0x144] + param_E[0] : c[0x0][0x148] + param_E[1] : c[0x0][0x14c] + param_F[0] : c[0x0][0x150] + param_F[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_N : c[0x0][0x15c] + param_K : c[0x0][0x160] + param_D : c[0x0][0x164] + param_H : c[0x0][0x168] + param_W : c[0x0][0x16c] + param_WN : c[0x0][0x170] + param_HWN : c[0x0][0x174] + param_DHWN : c[0x0][0x178] + param_C : c[0x0][0x17c] + param_CRST : c[0x0][0x180] + param_RST : c[0x0][0x184] + param_magic_RST : c[0x0][0x188] + param_shift_RST : c[0x0][0x18c] + param_RS : c[0x0][0x190] + param_magic_RS : c[0x0][0x194] + param_shift_RS : c[0x0][0x198] + param_S : c[0x0][0x19c] + param_magic_S : c[0x0][0x1a0] + param_shift_S : c[0x0][0x1a4] + param_pad_d : c[0x0][0x1a8] + param_pad_h : c[0x0][0x1ac] + param_pad_w : c[0x0][0x1b0] + param_str_d : c[0x0][0x1b4] + param_str_h : c[0x0][0x1b8] + param_str_w : c[0x0][0x1bc] + param_Q : c[0x0][0x1c0] + param_PQ : c[0x0][0x1c4] + param_QN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_MPQN : c[0x0][0x1d0] + param_magic_Q : c[0x0][0x1d4] + param_shift_Q : c[0x0][0x1d8] + param_magic_PQ : c[0x0][0x1dc] + param_shift_PQ : c[0x0][0x1e0] + param_CRST8 : c[0x0][0x1e4] + param_MPQN8 : c[0x0][0x1e8] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkE, blkF, blkMPQ + + 68-119 ~ k<0|4>, tidX, tid1, m, p, q, crst, n, n32, tf<0|4>, te, te<0|4>, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-79 : j0Ex<0-7>, j0Fy<0-7> + 80-95 : j1Ex<0-7>, j1Fy<0-7> + + 96-103 : load0F<0-3>, load4F<0-3> + 104-119 : load0E<0-7>, load4E<0-7> + + 120-123 : track0F<0-1>, track4F<0-1> + 124-127 : track0E<0-1>, 
track4E<0-1> + + 128-131 ~ writeEs, writeFs, swapBuf, K + 132-136 ~ readEs, readFs, mt, pr, qs + + 68-71 ~ lutStore, sliceI + 72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD + + 72-89 : c<0-7>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1> + 90-132 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkF, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; + + +// tidX = (tid & 7) << 2 +// k = tid >> 3 +01:-:-:-:1 LOP.AND tidX, tid, 7; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 k0, tid, 3; +--:-:-:-:1 IADD k4, k0, 4; + +--:-:-:-:1 MOV K, param_K; + +--:-:-:-:1 STS.128 [RZ], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15; + + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +08:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 
q, negQ, p, pq; + +// mt = m * w - pad_d +// pr = p * u - pad_h +// qs = q * v - pad_w +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, q, param_str_w, RZ; +--:-:-:-:1 IADD mt, mt, -param_pad_d; +--:-:-:-:1 IADD pr, pr, -param_pad_h; +--:-:-:-:1 IADD qs, qs, -param_pad_w; + +// crst = blkF*32 + tidX +// n = blkE*64 + tidX +04:-:-:-:1 ISCADD crst, blkF, tidX, 5; +08:-:-:-:1 ISCADD n, blkE, tidX, 6; +--:-:-:-:1 IADD n32, n, 32; + +// trackF = k*CRST + crst +--:-:-:-:1 XMAD tf0, k0, param_CRST, crst; +--:-:-:-:1 XMAD tf4, k4, param_CRST, crst; +--:-:-:-:1 LEA track0F0.CC, tf0, param_F[0], 2; +--:-:-:-:1 LEA.HI.X track0F1, tf0, param_F[1], RZ, 2; +--:-:-:-:1 LEA track4F0.CC, tf4, param_F[0], 2; +--:-:-:-:1 LEA.HI.X track4F1, tf4, param_F[1], RZ, 2; + +// trackE = k*MPQN + m*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD te, q, param_N, n; +--:-:-:-:1 XMAD.LO2C te, p, param_QN, te; +--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te; +--:-:-:-:1 XMAD.LO2C te0, k0, param_MPQN, te; +--:-:-:-:1 XMAD.LO2C te4, k4, param_MPQN, te; +--:-:-:-:1 LEA track0E0.CC, te0, param_E[0], 2; +--:-:-:-:1 LEA.HI.X track0E1, te0, param_E[1], RZ, 2; +--:-:-:-:1 LEA track4E0.CC, te4, param_E[0], 2; +--:-:-:-:1 LEA.HI.X track4E1, te4, param_E[1], RZ, 2; + +// P1 = crst < CRST +// P2 = n < N +// P3 = n+32 < N +--:-:-:-:1 ISETP.LT.AND P1, PT, crst, param_CRST, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, n, param_N, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, n32, param_N, PT; + +// writeFs = (32*k + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, k0, tidX, 5; +--:-:-:-:1 SHL writeFs, writeFs, 2; +// writeEs = (64*k + tidX) * 4 +--:-:-:-:1 ISCADD writeEs, k0, tidX, 6; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x<32*8>, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readEs = ((tid >> 1) & 7) 
<< 4 +--:-:-:-:1 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readEs, readEs, 4x<32*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, 4x<32*8 + 64*8>; + + +--:-:-:-:0 IADD K, K, -8; + +--:-:-:-:1 @P1 LDG.E.CI load0F0, [track0F + 4x<0>]; +--:-:-:-:1 @P1 LDG.E.CI load0F1, [track0F + 4x<1>]; +--:-:-:-:1 @P1 LDG.E.CI load0F2, [track0F + 4x<2>]; +--:-:1:-:1 @P1 LDG.E.CI load0F3, [track0F + 4x<3>]; + +--:-:-:-:1 @P1 LDG.E.CI load4F0, [track4F + 4x<0>]; +--:-:-:-:1 @P1 LDG.E.CI load4F1, [track4F + 4x<1>]; +--:-:-:-:1 @P1 LDG.E.CI load4F2, [track4F + 4x<2>]; +--:-:2:-:1 @P1 LDG.E.CI load4F3, [track4F + 4x<3>]; + +--:-:-:-:0 ISETP.GT.AND P1, PT, K, RZ, P1; + +--:-:3:-:1 @P2 LDG.E.128 load0E0, [track0E + 4x< 0>]; +--:-:4:-:1 @P3 LDG.E.128 load0E4, [track0E + 4x<32>]; +--:-:5:-:1 @P2 LDG.E.128 load4E0, [track4E + 4x< 0>]; +--:-:6:-:1 @P3 LDG.E.128 load4E4, [track4E + 4x<32>]; + +--:-:-:-:2 ISETP.GT.AND P2, PT, K, RZ, P2; +--:-:-:-:0 ISETP.GT.AND P3, PT, K, RZ, P3; + +01:-:-:-:1 STS.128 [writeFs + 4x<0*32>], load0F; +--:-:-:-:6 IADD track0F0.CC, track0F0, param_CRST8; +--:-:-:-:0 IADD.X track0F1, track0F1, RZ; + +02:-:-:-:1 STS.128 [writeFs + 4x<4*32>], load4F; +--:-:-:-:6 IADD track4F0.CC, track4F0, param_CRST8; +--:-:-:-:0 IADD.X track4F1, track4F1, RZ; + +04:-:-:-:1 STS.128 [writeEs + 4x<0*64 + 0>], load0E0; +08:-:-:-:1 STS.128 [writeEs + 4x<0*64 + 32>], load0E4; +--:-:-:-:6 IADD track0E0.CC, track0E0, param_MPQN8; +--:-:-:-:0 IADD.X track0E1, track0E1, RZ; + +10:-:-:-:1 STS.128 [writeEs + 4x<4*64 + 0>], load4E0; +20:1:-:-:1 STS.128 [writeEs + 4x<4*64 + 32>], load4E4; +--:-:-:-:6 IADD track4E0.CC, track4E0, param_MPQN8; +--:-:-:-:1 IADD.X track4E1, track4E1, RZ; + +01:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD writeFs, writeFs, swapBuf; +--:-:-:-:2 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD K, K, -8; + +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex4, 
[readEs + 4x<0*64 + 32>];
+--:-:1:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>];
+
+--:-:-:-:1 @P1 LDG.E.CI load0F0, [track0F + 4x<0>];
+--:-:-:-:1 @P1 LDG.E.CI load0F1, [track0F + 4x<1>];
+--:-:-:-:1 @P1 LDG.E.CI load0F2, [track0F + 4x<2>];
+--:-:2:-:1 @P1 LDG.E.CI load0F3, [track0F + 4x<3>];
+
+--:-:-:-:1 @P1 LDG.E.CI load4F0, [track4F + 4x<0>];
+--:-:-:-:1 @P1 LDG.E.CI load4F1, [track4F + 4x<1>];
+--:-:-:-:1 @P1 LDG.E.CI load4F2, [track4F + 4x<2>];
+--:-:3:-:1 @P1 LDG.E.CI load4F3, [track4F + 4x<3>];
+
+--:-:-:-:0 ISETP.GT.AND P1, PT, K, RZ, P1;
+
+--:-:4:-:1 @P2 LDG.E.128 load0E0, [track0E + 4x< 0>];
+--:-:4:-:1 @P3 LDG.E.128 load0E4, [track0E + 4x<32>];
+--:-:5:-:1 @P2 LDG.E.128 load4E0, [track4E + 4x< 0>];
+--:-:5:-:1 @P3 LDG.E.128 load4E4, [track4E + 4x<32>];
+
+--:-:-:-:2 ISETP.GT.AND P2, PT, K, RZ, P2;
+--:-:-:-:1 ISETP.GT.AND P3, PT, K, RZ, P3;
+
+NEXT_8K:
+--:-:-:-:1 ISETP.GT.AND P0, PT, K, -8, PT;
+
+    my %insert = # text spliced in after FFMA number <c> of unroll step <j>, keyed "j<j>c<c>"
+    (
+        j0c8 => "--:-:-:-:1 IADD K, K, -8;\n",
+        # Steps 0 and 2: store the F tiles loaded earlier to shared memory, advance their pointers, start the next F loads.
+        j0c12 => "02:2:-:-:1 \@P0 STS.128 [writeFs + 4x<0*32>], load0F;\n",
+        j0c14 => "--:-:-:-:1 \@P0 IADD track0F0.CC, track0F0, param_CRST8;\n",
+        j0c19 => "--:-:-:-:1 \@P0 IADD.X track0F1, track0F1, RZ;\n",
+
+        j0c56 => "02:-:-:-:1 \@P1 LDG.E.CI load0F0, [track0F + 4x<0>];\n",
+        j0c58 => "--:-:-:-:1 \@P1 LDG.E.CI load0F1, [track0F + 4x<1>];\n",
+        j0c60 => "--:-:-:-:1 \@P1 LDG.E.CI load0F2, [track0F + 4x<2>];\n",
+        j0c62 => "--:-:2:-:1 \@P1 LDG.E.CI load0F3, [track0F + 4x<3>];\n",
+
+        j2c12 => "04:3:-:-:1 \@P0 STS.128 [writeFs + 4x<4*32>], load4F;\n",
+        j2c14 => "--:-:-:-:1 \@P0 IADD track4F0.CC, track4F0, param_CRST8;\n",
+        j2c19 => "--:-:-:-:1 \@P0 IADD.X track4F1, track4F1, RZ;\n",
+
+        j2c56 => "04:-:-:-:1 \@P1 LDG.E.CI load4F0, [track4F + 4x<0>];\n",
+        j2c58 => "--:-:-:-:1 \@P1 LDG.E.CI load4F1, [track4F + 4x<1>];\n",
+        j2c60 => "--:-:-:-:1 \@P1 LDG.E.CI load4F2, [track4F + 4x<2>];\n",
+        j2c62 => "--:-:3:-:1 \@P1 LDG.E.CI load4F3, [track4F + 4x<3>];\n",
+        # Steps 4 and 6: the same store / advance / reload sequence for the E tiles, predicated on P2 and P3.
+        j4c12 => "08:-:-:-:1 \@P0 STS.128 [writeEs + 4x<0*64 + 0>], load0E0;\n",
+        j4c14 => "--:4:-:-:1 \@P0 STS.128 [writeEs + 4x<0*64 + 32>], load0E4;\n",
+        j4c16 => "--:-:-:-:1 \@P0 IADD track0E0.CC, track0E0, param_MPQN8;\n",
+        j4c21 => "--:-:-:-:1 \@P0 IADD.X track0E1, track0E1, RZ;\n",
+
+        j4c60 => "08:-:-:-:1 \@P2 LDG.E.128 load0E0, [track0E + 4x< 0>];\n",
+        j4c62 => "--:-:4:-:1 \@P3 LDG.E.128 load0E4, [track0E + 4x<32>];\n",
+
+        j6c12 => "10:-:-:-:1 \@P0 STS.128 [writeEs + 4x<4*64 + 0>], load4E0;\n",
+        j6c14 => "--:5:-:-:1 \@P0 STS.128 [writeEs + 4x<4*64 + 32>], load4E4;\n",
+        j6c16 => "--:-:-:-:1 \@P0 IADD track4E0.CC, track4E0, param_MPQN8;\n",
+        j6c21 => "--:-:-:-:1 \@P0 IADD.X track4E1, track4E1, RZ;\n",
+
+        j6c60 => "10:-:-:-:1 \@P2 LDG.E.128 load4E0, [track4E + 4x< 0>];\n",
+        j6c62 => "--:-:5:-:1 \@P3 LDG.E.128 load4E4, [track4E + 4x<32>];\n",
+        # End of step 6: swap the shared-memory read/write buffers by +/- swapBuf and negate swapBuf.
+        j6c63 => "--:-:-:-:1 \@P0 IADD readEs, readEs, -swapBuf;\n" .
+                 "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1 \@P0 IADD writeEs, writeEs, swapBuf;\n" .
+                 "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                 "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n",
+        # Re-arm the load predicates for the next tile. P2/P3 chain on themselves, as P1 does here and as all three do in the prologue, so the n < N and n+32 < N bounds are kept (they were previously rebuilt from PT and lost).
+        j7c8 => "--:-:-:-:1 ISETP.GT.AND P1, PT, K, RZ, P1;\n",
+        j7c10 => "--:-:-:-:1 ISETP.GT.AND P2, PT, K, RZ, P2;\n",
+        j7c12 => "--:-:-:-:1 ISETP.GT.AND P3, PT, K, RZ, P3;\n",
+
+        j7c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_8K;\n",
+    );
+    # Build the 64-entry FFMA visiting order over the 8x8 cx<x>y<y> accumulator tile.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]); # four (dx,dy) offsets applied to each base (x,y)
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # walk the y bases in the opposite direction for the next x pair
+    }
+    # Emit 8 unroll steps of 64 FFMAs each, appending the matching %insert text after each FFMA.
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd = $j & 1; # register set (j0* or j1*) the FFMAs of this step read
+        my $nOdd = !$odd + 0; # the other set, filled by the LDS instructions below
+        my $rsOffset = ($j + 1) % 8; # shared-memory row index used by those LDS instructions
+        my $rsPred = $j == 7 ? '@P0' : ' '; # only the loads issued in step 7 are predicated on P0
+        # Loads of the operands for the following step, spliced after FFMAs 0, 2, 4 and 6.
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins = $insert{"j${j}c$c"} || ''; # nothing to splice for most FFMAs
+
+            my $stall = $ins =~ /LDS|I2F|LDG|STS|BAR|BRA/ ? 0 : 1; # stall 0 when the spliced text contains one of these opcodes, else 1
+
+            my $yield = $c == 32 && $stall ? 'Y' : '-'; # yield flag only on FFMA 32, and only when its stall is 1
+
+            my $wait = $c == 0 ? '01' : '--'; # the first FFMA of each step carries wait mask 01
+
+            my $ctrl = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins;
+        }
+    }
+    return $out; # the complete generated loop body
+
+
+
+--:-:-:-:0 MOV warp_cnt, 32;
+--:-:1:-:1 S2R tid, SR_TID.X;
+--:-:2:-:1 S2R blkF, SR_CTAID.Y;
+--:-:3:-:1 S2R blkE, SR_CTAID.Z;
+01:-:-:-:6 MOV rst, tid;
+
+LUT_LOOP:
+
+
+// warp synchronous loop while warp_cnt < RST (c=0)
+--:-:-:-:1 ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT;
+--:-:-:-:1 IADD warp_cnt, warp_cnt, 32;
+// t = rst / RS
+// rs = rst % RS
+--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1 SHR.U32 t, t, param_shift_RS;
+--:-:-:-:1 XMAD rs, t, param_RS, RZ;
+--:-:-:-:1 IADD rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1 SHR.U32 r, r, param_shift_S;
+--:-:-:-:1 XMAD s, r, param_S, RZ;
+--:-:-:-:1 IADD s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+--:-:-:-:1 IADD z, mt, t;
+--:-:-:-:1 IADD y, pr, r;
+--:-:-:-:1 IADD x, qs, s;
+// i = (z*HWN + y*WN + x*N) * 4
+20:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI;
+--:-:-:-:1 XMAD sliceI, x, param_N, sliceI;
+--:-:-:-:1 SHL sliceI, sliceI, 2;
+// Bounds check x 
and y, and make i negative if outside
+--:-:-:-:1 ISET.LT.AND x0, x, RZ, PT;
+--:-:-:-:1 ISET.GE.AND xW, x, param_W, PT;
+--:-:-:-:1 ISET.LT.AND y0, y, RZ, PT;
+--:-:-:-:1 ISET.GE.AND yH, y, param_H, PT;
+--:-:-:-:1 ISET.LT.AND z0, z, RZ, PT;
+--:-:-:-:1 ISET.GE.AND zD, z, param_D, PT;
+--:-:-:-:1 LOP3.LUT sliceI, sliceI, x0, xW, 0xfe;
+
+--:-:-:-:1 LOP3.LUT sliceI, sliceI, y0, yH, 0xfe;
+--:-:-:-:1 SHL lutStore, rst, 2;
+--:-:-:-:1 IADD rst, rst, 32;
+
+--:-:-:-:1 LOP3.LUT sliceI, sliceI, z0, zD, 0xfe;
+// Store i imgOffset into the shared lookup table
+--:6:-:-:1 STS [lutStore + addr_lut], sliceI;
+
+
+--:-:-:-:5 @P0 BRA.U LUT_LOOP;
+
+
+
+--:-:-:-:1 MOV RST, param_RST;
+--:-:-:-:1 MOV DHWN1, param_DHWN;
+--:-:-:-:1 SHL DHWN1, DHWN1, 2;
+
+--:-:-:-:1 LOP.AND readEs, readEs, 0x7f;
+--:-:-:-:1 LOP.AND readFs, readFs, 0x3f;
+
+// writeCs = ((readIs / 4) * 64 + readEs);
+--:-:-:-:1 ISCADD writeCs, readFs, readEs, 4;
+
+// readCs = (tid & 31) << 2;
+--:-:-:-:1 LOP.AND tid31, tid, 31;
+--:-:-:-:1 SHL readCs, tid31, 2;
+
+// nn = blkE*64 + tid31;
+04:-:-:-:1 ISCADD nn, blkE, tid31, 6;
+
+// crst = blkF*32
+02:-:-:-:1 SHL crst00, blkF, 5;
+--:-:-:-:1 IADD crst04, crst00, 4;
+--:-:-:-:1 IADD crst08, crst00, 8;
+--:-:-:-:1 IADD crst12, crst00, 12;
+
+--:-:-:-:1 LEA trackI0.CC, nn, param_I[0], 2;
+--:-:-:-:1 LEA.HI.X trackI1, nn, param_I[1], RZ, 2;
+
+// n < N
+--:-:-:-:1 ISETP.LT.AND P5, PT, nn, param_N, PT;
+--:-:-:-:1 IADD nn, nn, 32;
+--:-:-:-:1 ISETP.LT.AND P6, PT, nn, param_N, PT;
+
+--:-:-:-:1 MOV alpha, param_alpha;
+
+
+
+
+
+    my $out; # generated epilogue text: one scale-and-store group per accumulator row y
+    foreach my $y (0..7)
+    {
+        $out .=
+            "--:-:-:-:1 IADD crst00, crst00, 12;\n" .
+            "--:-:-:-:1 IADD crst04, crst04, 12;\n" .
+            "--:-:-:-:1 IADD crst08, crst08, 12;\n" .
+            "--:-:-:-:1 IADD crst12, crst12, 12;\n" if $y == 4; # before row 4 only: add 12 to each of the four crst counters
+        # Scale the eight accumulators of row y (cx0y<y> .. cx7y<y>) by alpha into c0..c7.
+        $out .= sprintf(
+            "01:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" .
+            "02:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" .
+            "04:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" .
+            "08:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n",
+            ($y) x 8); # the same row index fills all eight %d slots
+        # Hand c0..c7 to the STORE_C subroutine defined after EXIT.
+        $out .= "--:-:-:-:5 CAL STORE_C;\n\n";
+    }
+    return $out; # eight FMUL groups, each followed by a call to STORE_C
+
+
+
+--:-:-:-:5 EXIT;
+
+STORE_C:
+
+// Warp shuffle to drop the awkward readAs/readBs mapping
+--:-:-:-:1 STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1 STS.128 [writeCs+4x<32>], c4;
+
+--:-:-:-:1 LDS c0, [readCs + 4x<0*64 + 00>];
+--:-:-:-:1 LDS c1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1 LDS c2, [readCs + 4x<1*64 + 00>];
+--:-:-:-:1 LDS c3, [readCs + 4x<1*64 + 32>];
+--:-:-:-:1 LDS c4, [readCs + 4x<2*64 + 00>];
+--:-:-:-:1 LDS c5, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1 LDS c6, [readCs + 4x<3*64 + 00>];
+--:-:-:-:1 LDS c7, [readCs + 4x<3*64 + 32>];
+
+
+--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5;
+--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5;
+--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5;
+--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5;
+
+--:-:-:-:1 XMAD.LO2C c00, crst00, param_magic_RST, RZ;
+--:-:-:-:1 XMAD.LO2C c04, crst04, param_magic_RST, RZ;
+--:-:-:-:1 XMAD.LO2C c08, crst08, param_magic_RST, RZ;
+--:-:-:-:1 XMAD.LO2C c12, crst12, param_magic_RST, RZ;
+
+--:-:-:-:1 SHR.U32 c00, c00, param_shift_RST;
+--:-:-:-:1 SHR.U32 c04, c04, param_shift_RST;
+--:-:-:-:1 SHR.U32 c08, c08, param_shift_RST;
+--:-:-:-:1 SHR.U32 c12, c12, param_shift_RST;
+
+--:-:-:-:1 VMAD.U16.U16 lut00, -c00, RST, crst00;
+--:-:-:-:1 VMAD.U16.U16 lut04, -c04, RST, crst04;
+--:-:-:-:1 VMAD.U16.U16 lut08, -c08, RST, crst08;
+--:-:-:-:1 VMAD.U16.U16 lut12, -c12, RST, crst12;
+
+--:-:-:-:1 SHL lut00, lut00, 2;
+--:-:-:-:1 SHL lut04, lut04, 2;
+--:-:-:-:1 SHL lut08, lut08, 2;
+--:-:-:-:1 SHL lut12, lut12, 2;
+
+--:-:-:-:1 XMAD.LO2 chan00, DHWN1, c00, RZ;
+--:-:-:-:1 XMAD.LO2 chan04, DHWN1, c04, RZ;
+--:-:-:-:1 XMAD.LO2 chan08, DHWN1, c08, RZ;
+--:-:-:-:1 XMAD.LO2 chan12, DHWN1, c12, RZ;
+
+--:-:-:-:1 IADD crst00, 
crst00, 1; +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 IADD crst12, crst12, 1; + +--:-:1:-:1 @P0 LDS img00, [lut00 + addr_lut]; +--:-:2:-:1 @P1 LDS img04, [lut04 + addr_lut]; +--:-:3:-:1 @P2 LDS img08, [lut08 + addr_lut]; +--:-:4:-:1 @P3 LDS img12, [lut12 + addr_lut]; + + + +01:-:-:-:1 IADD3 track00I0.CC, trackI0, img00, chan00; +--:-:-:-:5 ISETP.GE.AND P0, PT, img00, RZ, P0; +--:-:-:-:1 IADD.X track00I1, trackI1, RZ; + +02:-:-:-:1 IADD3 track04I0.CC, trackI0, img04, chan04; +--:-:-:-:5 ISETP.GE.AND P1, PT, img04, RZ, P1; +--:-:-:-:1 IADD.X track04I1, trackI1, RZ; + +04:-:-:-:1 IADD3 track08I0.CC, trackI0, img08, chan08; +--:-:-:-:5 ISETP.GE.AND P2, PT, img08, RZ, P2; +--:-:-:-:1 IADD.X track08I1, trackI1, RZ; + +08:-:-:-:1 IADD3 track12I0.CC, trackI0, img12, chan12; +--:-:-:-:5 ISETP.GE.AND P3, PT, img12, RZ, P3; +--:-:-:-:0 IADD.X track12I1, trackI1, RZ; + +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00I], c0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04I], c2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08I], c4; +--:-:-:-:3 PSETP.AND.AND P2, PT, P2, P6, PT; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12I], c6; +--:-:-:-:5 PSETP.AND.AND P3, PT, P3, P6, PT; + +--:1:-:-:2 @P0 RED.E.ADD.F32.FTZ.RN [track00I + 4x<32>], c1; +--:2:-:-:2 @P1 RED.E.ADD.F32.FTZ.RN [track04I + 4x<32>], c3; +--:3:-:-:4 @P2 RED.E.ADD.F32.FTZ.RN [track08I + 4x<32>], c5; +--:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12I + 4x<32>], c7; + +--:-:-:-:5 RET; + diff --git a/Kernel/Convolution/Maxwell/sconv_updat_C128_K128.sass b/Kernel/Convolution/Maxwell/sconv_updat_C128_K128.sass new file mode 100644 index 0000000..dfb6bea --- /dev/null +++ b/Kernel/Convolution/Maxwell/sconv_updat_C128_K128.sass @@ -0,0 +1,718 @@ +# Kernel: sconv_updat_C128_K128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(128*16 + 32)*4 + 0> + addr_m : 4x<(128*16 + 32)*4 + 4> + addr_q : 4x<(128*16 + 32)*4 + 5> + szBuf : (128*16 + 32) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_offset_K : c[0x0][0x15c] + param_N : c[0x0][0x160] + param_K : c[0x0][0x164] + param_D : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_WN : c[0x0][0x174] + param_HWN : c[0x0][0x178] + param_DHWN : c[0x0][0x17c] + param_C : c[0x0][0x180] + param_CRST : c[0x0][0x184] + param_RST : c[0x0][0x188] + param_magic_RST : c[0x0][0x18c] + param_shift_RST : c[0x0][0x190] + param_RS : c[0x0][0x194] + param_magic_RS : c[0x0][0x198] + param_shift_RS : c[0x0][0x19c] + param_S : c[0x0][0x1a0] + param_magic_S : c[0x0][0x1a4] + param_shift_S : c[0x0][0x1a8] + param_pad_d : c[0x0][0x1ac] + param_pad_h : c[0x0][0x1b0] + param_pad_w : c[0x0][0x1b4] + param_str_d : c[0x0][0x1b8] + param_str_h : c[0x0][0x1bc] + param_str_w : c[0x0][0x1c0] + param_dil_d : c[0x0][0x1c4] + param_dil_h : c[0x0][0x1c8] + param_dil_w : c[0x0][0x1cc] + param_P : c[0x0][0x1d0] + param_Q : c[0x0][0x1d4] + param_PQ : c[0x0][0x1d8] + param_QN : c[0x0][0x1dc] + param_PQN : c[0x0][0x1e0] + param_MPQN : c[0x0][0x1e4] + param_magic_Q : c[0x0][0x1e8] + param_shift_Q : c[0x0][0x1ec] + param_magic_PQ : 
c[0x0][0x1f0] + param_shift_PQ : c[0x0][0x1f4] + param_grid_P : c[0x0][0x1f8] + param_grid_Q : c[0x0][0x1fc] + param_grid_PQ : c[0x0][0x200] + param_CRSTK : c[0x0][0x204] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkI, blkE, one + 68-111 ~ tidX, tidY, tid1, tid7, tid128, shiftX, blkMPQ, m, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-95 ~ tidYY, mm, mt, pr, y, z, y0, yH, z0, zD, bounds_yz, c, r, t, rs, rst + 64-95 ~ qs, x, x0, xW, bounds_x, ti, te, Q + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 96-111 : loadI<0-7>, loadE<0-7> + 112-115 : trackI<0-1>, trackE<0-1> + + 116-124 ~ writeS, loopN, e, i, p, q, k, crst, s + 125-127 ~ swapBuf, readIs, readEs + + 68-83 : c<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 84-124 ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128, blk_MPQ, CRSTK, xmad_determ + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID1; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkI, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; +--:-:-:-:5 BRA.U END_CTAID1; +CTAID1: +--:-:2:-:1 S2R blkMPQ, SR_CTAID.Z; +--:-:3:-:1 S2R blkI, SR_CTAID.X; +--:-:4:-:1 S2R blkE, SR_CTAID.Y; +END_CTAID1: + + +// tidX = tid >> 1 +// tidY = (tid & 1) << 2 +// shiftX = (tid & 1) << 4 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tid1, 2; +--:-:-:-:1 SHL shiftX, tid1, 4; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +--:-:-:-:1 PSETP.AND.AND P0, PT, PT, 
PT, PT; + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_grid_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_grid_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; + +// We need to be able to restore m and q at each P iteration +// Register spill to shared +--:1:-:-:1 STS [addr_m], m; +--:-:-:-:1 STS [addr_q], q; + +// writeBs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 IADD writeS, writeS, shiftX; +--:-:-:-:1 ISCADD writeS, writeS, 4x, 2; + +// readIs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readIs, tid, 0x70; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readEs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + szBuf; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readEs, tid128, 4; +--:-:-:-:1 LOP.OR readEs, readEs, tid7; +--:-:-:-:1 ISCADD readEs, readEs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + +// crst = 
blockI*128 + tidX +04:-:-:-:1 ISCADD crst, blkI, tidX, 7; + +// k = blockE*128 + tidX + offset_K +08:-:-:-:1 ISCADD k, blkE, tidX, 7; +--:-:-:-:1 IADD k, k, param_offset_K; + +--:-:-:-:1 MOV loopN, param_N; + + + +NEXT_P: + +01:-:4:-:1 S2R tidYY, SR_TID.X; +--:-:5:-:1 LDS mm, [addr_m]; + + +--:-:6:-:1 LDS q, [addr_q]; + +// c = crst / RST +// rst = crst % RST +--:-:-:-:1 XMAD.LO2C c, crst, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c, c, param_shift_RST; +--:-:-:-:1 XMAD rst, c, param_RST, RZ; +--:-:-:-:1 IADD rst, -rst, crst; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// y = p * str_h - pad_h + (r * dil_h) +// z = mm * str_d - pad_d + (t * dil_d) +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +10:-:-:-:1 XMAD mt, mm, param_str_d, RZ; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 IADD y, y, -param_pad_h; +--:-:-:-:1 IADD z, z, -param_pad_d; +// e = k*MPQN + mm*PQN + p*QN + tidYY, where tidYY = (tid & 1) << 2 +08:-:-:-:1 LOP.AND tidYY, tidYY, 1; +--:-:-:-:1 SHL tidYY, tidYY, 2; +--:-:-:-:1 XMAD.LO2C e, p, param_QN, tidYY; +--:-:-:-:1 XMAD.LO2C e, mm, param_PQN, e; +--:-:-:-:1 XMAD.LO2C e, k, param_MPQN, e; +// i = c*DHWN + z*HWN + y*WN + tidYY +--:-:-:-:1 XMAD.LO2C i, y, param_WN, tidYY; +--:-:-:-:1 XMAD.LO2C i, z, param_HWN, i; +--:-:-:-:1 XMAD.LO2C i, c, param_DHWN, i; +// bounds_yz = y < 0 || y >= H || z < 0 || z >= D ? 
-1 : 0 +--:-:-:-:1 ISET.LT.AND y0, y, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH, y, param_H, PT; +--:-:-:-:1 ISET.LT.AND z0, z, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD, z, param_D, PT; +--:-:-:-:1 LOP.OR bounds_yz, y0, yH; +--:-:-:-:1 LOP3.LUT bounds_yz, bounds_yz, z0, zD, 0xfe; +// doLoadCRST (P4) = crst < CRST && bounds_yz == 0 +--:-:-:-:1 ISETP.LT.AND P4, PT, crst, param_CRST, PT; +--:-:-:-:1 ISETP.EQ.AND P4, PT, bounds_yz, RZ, P4; +// p += grid_P; P6 = p < P +--:-:-:-:1 IADD p, p, param_grid_P; + +--:-:-:-:1 ISETP.LT.AND P6, PT, p, param_P, PT; + + +NEXT_Q: + + +// Zigzag q (Q = param_Q - 1 - q) when p is odd, but only if grid_P < P; otherwise Q = q +--:-:-:-:1 LOP.AND.NZ P1, RZ, p, 1; +--:-:-:-:1 MOV Q, param_grid_P; +--:-:-:-:1 ISETP.LT.AND P1, PT, Q, param_P, P1; +--:-:-:-:1 MOV Q, -1; +20:-:-:-:1 @P1 IADD3 Q, -q, param_Q, Q; +--:-:-:-:1 @!P1 MOV Q, q; +// P3 = k < K +--:-:-:-:1 ISETP.LT.AND P3, PT, k, param_K, PT; +// qs = Q * str_w +// x = qs + (s * dil_w) - pad_w +--:-:-:-:1 XMAD qs, Q, param_str_w, RZ; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 IADD x, x, -param_pad_w; +// bounds_x = x < 0 || x >= W ? 
-1 : 0 +--:-:-:-:1 ISET.LT.AND x0, x, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW, x, param_W, PT; +--:-:-:-:1 LOP.OR bounds_x, x0, xW; +// doLoad = crst < CRST && bounds_yz == 0 && bounds_x == 0 +--:-:-:-:1 ISETP.EQ.AND P2, PT, bounds_x, RZ, P4; +// trackI = I + i + x*N +--:-:-:-:1 XMAD ti, x, param_N, i; +--:-:-:-:1 LEA trackI0.CC, ti, param_I[0], 2; +--:-:-:-:1 LEA.HI.X trackI1, ti, param_I[1], RZ, 2; +// trackE = E + e + q*N +--:-:-:-:1 XMAD te, Q, param_N, e; +--:-:-:-:1 LEA trackE0.CC, te, param_E[0], 2; +--:-:-:-:0 LEA.HI.X trackE1, te, param_E[1], RZ, 2; +// q += grid_Q +--:-:-:-:1 IADD q, q, param_grid_Q; +--:-:-:-:1 ISETP.LT.AND P5, PT, q, param_Q, PT; + +--:-:-:-:1 @!P0 IADD loopN, loopN, param_N; + + + +--:-:-:Y:6 @!P0 BRA.U NEXT_PQ; + +--:-:-:-:0 PSETP.AND.AND P0, PT, PT, PT, !PT; + +--:-:1:-:1 @P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>]; +--:-:2:-:1 @P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>]; +--:-:-:-:1 @!P2 LDS.U.128 loadI0, [addr_zero]; +--:-:5:-:1 @!P2 LDS.U.128 loadI4, [addr_zero]; + +--:-:-:-:0 ISETP.LE.AND P1, PT, loopN, 32, PT; + +--:-:3:-:1 @P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>]; +--:-:4:-:1 @P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>]; +--:-:-:-:1 @!P3 LDS.U.128 loadE0, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 loadE4, [addr_zero]; + +11:-:-:-:1 STS [writeS + 4x< 0*128>], loadI0; +--:-:-:-:1 STS [writeS + 4x< 1*128>], loadI1; +--:-:-:-:1 STS [writeS + 4x< 2*128>], loadI2; +--:-:-:-:1 STS [writeS + 4x< 3*128>], loadI3; + +02:-:-:-:1 STS [writeS + 4x< 8*128 + 16>], loadI4; +--:-:-:-:1 STS [writeS + 4x< 9*128 + 16>], loadI5; +--:-:-:-:1 STS [writeS + 4x<10*128 + 16>], loadI6; +--:-:-:-:1 STS [writeS + 4x<11*128 + 16>], loadI7; + +--:-:-:-:1 IADD trackI0.CC, trackI0, 4x<16>; +--:-:-:-:0 PSETP.AND.AND P5, PT, P1, P5, PT; + +24:-:-:-:1 STS [writeS + 4x< 0*128 + szBuf>], loadE0; +--:-:-:-:1 STS [writeS + 4x< 1*128 + szBuf>], loadE1; +--:-:-:-:1 STS [writeS + 4x< 2*128 + szBuf>], loadE2; +--:-:-:-:1 STS [writeS + 4x< 3*128 + szBuf>], loadE3; + +--:-:-:-:0 
PSETP.AND.AND P6, PT, P1, P6, PT; + +08:-:-:-:1 STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4; +--:-:-:-:1 STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5; +--:-:-:-:1 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6; +--:1:-:-:1 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7; + +--:-:-:-:1 IADD.X trackI1, trackI1, RZ; + +--:-:-:-:1 IADD trackE0.CC, trackE0, 4x<16>; + +--:-:-:-:1 IADD readEs, readEs, -swapBuf; +--:-:-:-:0 IADD readIs, readIs, -swapBuf; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackE1, trackE1, RZ; + +--:-:2:-:1 @P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>]; +--:5:2:-:1 @P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>]; +--:-:3:-:1 @P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>]; +--:6:3:-:1 @P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>]; + +10:-:-:-:6 @P2 IADD trackI0.CC, trackI0, 4x<16>; +--:-:-:-:1 @P2 IADD.X trackI1, trackI1, RZ; +20:-:-:-:6 @P3 IADD trackE0.CC, trackE0, 4x<16>; +--:-:-:-:0 @P3 IADD.X trackE1, trackE1, RZ; + +--:-:-:Y:5 @P5 BRA.U NEXT_Q; +--:-:-:Y:5 @P6 BRA.U NEXT_P; + +--:-:-:-:2 ISETP.LT.AND P5, PT, q, param_Q, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, p, param_P, PT; + +NEXT_PQ: + +--:-:1:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*128 + 64>]; +--:-:1:-:2 LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + + +// P0 loop N +// P2 bounds I +// P3 bounds E +// P4 bounds yz +// P5 loop Q +// P6 loop P + +//loop = N >= 16 && (N >= 32 || (!p5 && !p6)) + +NEXT_16N: + + + + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD loopN, loopN, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, loopN, 16, PT;\n", + + j4c8 => "02:-:-:-:1 \@P0 STS [writeS + 4x< 0*128>], loadI0;\n", + j4c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 1*128>], loadI1;\n", + j4c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 2*128>], loadI2;\n", + j4c14 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 3*128>], loadI3;\n", + + j5c8 => 
"--:-:-:-:1 \@P0 STS [writeS + 4x< 8*128 + 16>], loadI4;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 9*128 + 16>], loadI5;\n", + j5c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x<10*128 + 16>], loadI6;\n", + j5c14 => "--:2:-:-:1 \@P0 STS [writeS + 4x<11*128 + 16>], loadI7;\n", + + j5c16 => "--:-:-:-:1 ISETP.GE.AND P2, PT, loopN, 32, P2;\n", + + j5c60 => "02:-:2:-:1 \@P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>];\n", + j5c62 => "--:4:2:-:1 \@P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>];\n", + + j6c16 => "--:-:-:-:1 \@!P2 LDS.U.128 loadI0, [addr_zero];\n", + j7c16 => "--:-:-:-:1 \@!P2 LDS.U.128 loadI4, [addr_zero];\n", + + j10c57 => "08:-:-:-:1 \@P2 IADD trackI0.CC, trackI0, 4x<16>;\n", + j10c62 => "--:-:-:-:1 \@P2 IADD.X trackI1, trackI1, RZ;\n", + + j12c8 => "04:-:-:-:1 \@P0 STS [writeS + 4x< 0*128 + szBuf>], loadE0;\n", + j12c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 1*128 + szBuf>], loadE1;\n", + j12c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 2*128 + szBuf>], loadE2;\n", + j12c14 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 3*128 + szBuf>], loadE3;\n", + + j13c8 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4;\n", + j13c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5;\n", + j13c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;\n", + j13c14 => "--:3:-:-:1 \@P0 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;\n", + + j13c16 => "--:-:-:-:1 ISETP.GE.AND P3, PT, loopN, 32, P3;\n", + + j13c60 => "04:-:3:-:1 \@P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>];\n", + j13c62 => "--:4:3:-:1 \@P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>];\n", + + j14c16 => "--:-:-:-:1 \@!P3 LDS.U.128 loadE0, [addr_zero];\n", + j15c16 => "--:-:-:-:1 \@!P3 LDS.U.128 loadE4, [addr_zero];\n", + + j15c57 => "08:-:-:-:1 \@P3 IADD trackE0.CC, trackE0, 4x<16>;\n", + j15c62 => "--:-:-:-:1 \@P3 IADD.X trackE1, trackE1, RZ;\n", + + j14c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "20:-:-:-:1 \@P0 IADD readEs, readEs, -swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j15c24 => "--:-:-:-:1 ISETP.GT.AND P1, PT, loopN, 32, PT;\n", + j15c37 => "--:-:-:-:1 PSETP.AND.OR P1, PT, !P5, !P6, P1;\n", + j15c50 => "--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, PT;\n", + + j15c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_16N;\n" . + "01:-:-:Y:5 \@P5 BRA.U NEXT_Q;\n" . + "--:-:-:Y:5 \@P6 BRA.U NEXT_P;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? '@P0' : ' '; + my $shift = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2; + my $barrier = $j == 14 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID2; +--:-:2:-:1 S2R blkI, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.X; +--:-:-:-:5 BRA.U END_CTAID2; +CTAID2: +--:-:2:-:1 S2R blkI, SR_CTAID.X; +--:-:3:-:1 S2R blkE, SR_CTAID.Y; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.Z; +END_CTAID2: + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readEs, readEs, -4x; +--:-:-:-:1 @P0 IADD readIs, readIs, -swapBuf; +--:-:-:-:1 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 128 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 5; + +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +01:-:-:-:1 LOP.AND t128, tid, 128; + +// kk = tid31 | (t128 >> 2); +--:-:-:-:1 SHR.U32 kk, t128, 2; +--:-:-:-:1 LOP.OR kk, tid31, kk; + +// readCs = ((tid96 << 4) | kk) << 2; +--:-:-:-:1 SHL readCs, tid96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, kk; +--:-:-:-:1 SHL readCs, readCs, 2; + +// kk += blkE*128; +04:-:-:-:1 ISCADD kk, blkE, kk, 7; +--:-:-:-:1 IADD kk, kk, param_offset_K; + +// crst = blkI*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 crst00, tid96, 1; +02:-:-:-:1 ISCADD crst00, blkI, crst00, 7; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; +--:-:-:-:1 SHL K4, K, 4; +--:-:-:-:1 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +--:-:-:-:1 VMAD.U16.U16 tf, crst00, K, kk; +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSTK, param_CRSTK; +08:-:-:-:1 XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ; + }; + } + return ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 0x2; +--:-:-:-:1 LEA.HI.X 
track00F1, tf, param_F[1], RZ, 0x2; + +// kk < K +--:-:-:-:1 ISETP.LT.AND P5, PT, kk, param_K, PT; +--:-:-:-:1 IADD kk, kk, 64; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, PT; + +--:-:-:-:1 MOV alpha, param_alpha; + + + +--:-:-:-:6 IADD track04F0.CC, track00F0, K4; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:6 IADD track08F0.CC, track04F0, K4; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:6 IADD track12F0.CC, track08F0, K4; +--:-:-:-:0 IADD.X track12F1, track08F1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD track00F0.CC, track00F0, K60;\n" . + "--:-:-:-:1 IADD crst00, crst00, 60;\n" . + "--:-:-:-:1 IADD.X track00F1, track00F1, RZ;\n" . + "--:-:-:-:5 IADD track04F0.CC, track04F0, K60;\n" . + "--:-:-:-:1 IADD crst04, crst04, 60;\n" . + "--:-:-:-:1 IADD.X track04F1, track04F1, RZ;\n" . + "--:-:-:-:5 IADD track08F0.CC, track08F0, K60;\n" . + "--:-:-:-:1 IADD crst08, crst08, 60;\n" . + "--:-:-:-:1 IADD.X track08F1, track08F1, RZ;\n" . + "--:-:-:-:5 IADD track12F0.CC, track12F0, K60;\n" . + "--:-:-:-:1 IADD crst12, crst12, 60;\n" . + "--:-:-:-:1 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +--:-:-:-:0 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], c0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], c4; + +--:-:1:-:1 LDS c0, [readCs + 4x<0*128 + 00>]; +--:-:2:-:1 LDS c2, [readCs + 4x<1*128 + 00>]; +--:-:3:-:1 LDS c4, [readCs + 4x<2*128 + 00>]; +--:-:4:-:a LDS c6, [readCs + 4x<3*128 + 00>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:-:-:-:1 @P0 STG.E.CG [track00F], c0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 STG.E.CG [track04F], c2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 STG.E.CG [track08F], c4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 STG.E.CG [track12F], c6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } + else + { + return q{ +01:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F], c0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F], c2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F], c4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F], c6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } ++] + +--:-:1:-:1 LDS c1, [readCs + 4x<0*128 + 64>]; +--:-:2:-:1 LDS c3, [readCs + 4x<1*128 + 64>]; +--:-:3:-:1 LDS c5, [readCs + 4x<2*128 + 64>]; +--:-:4:-:a LDS c7, [readCs + 4x<3*128 + 64>]; + 
+[+ + our $determ; + if ($determ) + { + return q{ +01:1:-:-:1 @P0 STG.E.CG [track00F + 4x<64>], c1; +02:2:-:-:1 @P1 STG.E.CG [track04F + 4x<64>], c3; +04:3:-:-:1 @P2 STG.E.CG [track08F + 4x<64>], c5; +08:4:-:-:1 @P3 STG.E.CG [track12F + 4x<64>], c7; + }; + } + else + { + return q{ +01:1:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], c1; +02:2:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], c3; +04:3:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], c5; +08:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], c7; + }; + } ++] + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Maxwell/sconv_updat_C128_K64.sass b/Kernel/Convolution/Maxwell/sconv_updat_C128_K64.sass new file mode 100644 index 0000000..26cc64c --- /dev/null +++ b/Kernel/Convolution/Maxwell/sconv_updat_C128_K64.sass @@ -0,0 +1,818 @@ +# Kernel: sconv_updat_C128_K64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2> + szShareI : (128*16 + 32) + szShareE : (64*16 + 32) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_offset_K : c[0x0][0x15c] + param_N : c[0x0][0x160] + param_K : c[0x0][0x164] + param_D : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_WN : c[0x0][0x174] + param_HWN : c[0x0][0x178] + param_DHWN : c[0x0][0x17c] + param_C : c[0x0][0x180] + param_CRST : c[0x0][0x184] + param_RST : c[0x0][0x188] + param_magic_RST : c[0x0][0x18c] + param_shift_RST : c[0x0][0x190] + param_RS : c[0x0][0x194] + param_magic_RS : c[0x0][0x198] + param_shift_RS : c[0x0][0x19c] + param_S : c[0x0][0x1a0] + param_magic_S : c[0x0][0x1a4] + param_shift_S : c[0x0][0x1a8] + param_pad_d : c[0x0][0x1ac] + param_pad_h : c[0x0][0x1b0] + param_pad_w : c[0x0][0x1b4] + param_str_d : c[0x0][0x1b8] + param_str_h : c[0x0][0x1bc] + param_str_w : c[0x0][0x1c0] + param_dil_d : c[0x0][0x1c4] + param_dil_h : c[0x0][0x1c8] + param_dil_w : c[0x0][0x1cc] + param_P : c[0x0][0x1d0] + param_Q : c[0x0][0x1d4] + param_PQ : c[0x0][0x1d8] + param_QN : c[0x0][0x1dc] + param_PQN : c[0x0][0x1e0] + param_MPQN : c[0x0][0x1e4] + param_magic_Q : c[0x0][0x1e8] + param_shift_Q : c[0x0][0x1ec] + param_magic_PQ : c[0x0][0x1f0] + param_shift_PQ : c[0x0][0x1f4] + param_grid_P : c[0x0][0x1f8] + param_grid_Q : c[0x0][0x1fc] + param_grid_PQ : c[0x0][0x200] + param_CRSTK : c[0x0][0x204] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkI, blkE, one + 68-99 ~ blkMPQ, tidX, tid1, shiftX, magicPQ, 
magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-72 ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q + 73-99 ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1> + 73-99 ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 100-147 : load0I<00-15>, load1I<00-15>, loadE<00-15> + 148-153 : track0I<0-1>, track1I<0-1>, trackE<0-1> + + 154-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY + 165-167 ~ readIs, readEs, swapBuf + + 68-83 : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID1; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkI, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; +--:-:-:-:5 BRA.U END_CTAID1; +CTAID1: +--:-:2:-:1 S2R blkMPQ, SR_CTAID.Z; +--:-:3:-:1 S2R blkI, SR_CTAID.X; +--:-:4:-:1 S2R blkE, SR_CTAID.Y; +END_CTAID1: + + +// tidX = tid >> 1 +// tidY = (tid & 1) << 2 +// shiftX = (tid & 1) << 4 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tid1, 2; +--:-:-:-:1 SHL shiftX, tid1, 4; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_grid_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_grid_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 
IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; +--:-:-:-:1 MOV qq, q; + +// writeIs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 7; +--:-:-:-:1 IADD writeIs, writeIs, shiftX; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +// writeEs = (tidY*64 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeEs, tidY, tidX, 6; +--:-:-:-:1 IADD writeEs, writeEs, shiftX; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x, 2; + +// readIs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readIs, tid, -16; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; +--:-:-:-:1 SHL readIs, readIs, 4; +// readEs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +--:-:-:-:1 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readEs, readEs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + +// crst = blockI*128 + tid +04:-:-:-:1 ISCADD crst0, blkI, tidX, 7; +--:-:-:-:1 IADD crst1, crst0, 64; + +// k = blockE*64 + tid +08:-:-:-:1 ISCADD k, blkE, tidX, 6; +--:-:-:-:1 IADD k, k, param_offset_K; + +--:-:-:-:1 MOV loopN, RZ; + +--:-:-:-:1 PSETP.AND.AND P0, PT, PT, PT, PT; + + +NEXT_PQ: + + +// Zigzag q but only if grid_P < P +--:-:-:-:1 LOP.AND.NZ P1, RZ, p, 1; +--:-:-:-:1 MOV Q, param_grid_P; +--:-:-:-:1 ISETP.LT.AND P1, PT, Q, param_P, P1; +--:-:-:-:1 MOV Q, -1; +--:-:-:-:1 @P1 IADD3 Q, -q, param_Q, Q; +--:-:-:-:1 @!P1 MOV Q, q; +// c = crst / RST +// rst = crst % RST +--:-:-:-:1 XMAD.LO2C c0, crst0, 
param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c0, c0, param_shift_RST; +--:-:-:-:1 XMAD rst0, c0, param_RST, RZ; +--:-:-:-:1 IADD rst0, -rst0, crst0; +--:-:-:-:1 XMAD.LO2C c1, crst1, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c1, c1, param_shift_RST; +--:-:-:-:1 XMAD rst1, c1, param_RST, RZ; +--:-:-:-:1 IADD rst1, -rst1, crst1; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t0, rst0, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t0, t0, param_shift_RS; +--:-:-:-:1 XMAD rs0, t0, param_RS, RZ; +--:-:-:-:1 IADD rs0, -rs0, rst0; +--:-:-:-:1 XMAD.LO2C t1, rst1, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t1, t1, param_shift_RS; +--:-:-:-:1 XMAD rs1, t1, param_RS, RZ; +--:-:-:-:1 IADD rs1, -rs1, rst1; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r0, rs0, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r0, r0, param_shift_S; +--:-:-:-:1 XMAD s0, r0, param_S, RZ; +--:-:-:-:1 IADD s0, -s0, rs0; +--:-:-:-:1 XMAD.LO2C r1, rs1, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r1, r1, param_shift_S; +--:-:-:-:1 XMAD s1, r1, param_S, RZ; +--:-:-:-:1 IADD s1, -s1, rs1; +// z = m * str_d - pad_d + (t * dil_d) (evaluated for both I tracks, suffix 0 and 1) +// y = p * str_h - pad_h + (r * dil_h) +// x = Q * str_w - pad_w + (s * dil_w) +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, Q, param_str_w, RZ; +--:-:-:-:1 XMAD z1, t1, param_dil_d, mt; +--:-:-:-:1 XMAD y1, r1, param_dil_h, pr; +--:-:-:-:1 XMAD x1, s1, param_dil_w, qs; +--:-:-:-:1 XMAD z0, t0, param_dil_d, mt; +--:-:-:-:1 XMAD y0, r0, param_dil_h, pr; +--:-:-:-:1 XMAD x0, s0, param_dil_w, qs; +--:-:-:-:1 IADD z1, z1, -param_pad_d; +--:-:-:-:1 IADD y1, y1, -param_pad_h; +--:-:-:-:1 IADD x1, x1, -param_pad_w; +--:-:-:-:1 IADD z0, z0, -param_pad_d; +--:-:-:-:1 IADD y0, y0, -param_pad_h; +--:-:-:-:1 IADD x0, x0, -param_pad_w; + + +// Split blocks to fit inside of 36 registers + +// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY +--:-:-:-:1 XMAD.LO2C ti0, c0, param_DHWN, tidY; +--:-:-:-:1 XMAD.LO2C ti0, z0, param_HWN, ti0; +--:-:-:-:1 XMAD.LO2C ti0, y0, 
param_WN, ti0; +--:-:-:-:1 XMAD ti0, x0, param_N, ti0; +--:-:-:-:1 XMAD.LO2C ti1, c1, param_DHWN, tidY; +--:-:-:-:1 XMAD.LO2C ti1, z1, param_HWN, ti1; +--:-:-:-:1 XMAD.LO2C ti1, y1, param_WN, ti1; +--:-:-:-:1 XMAD ti1, x1, param_N, ti1; +--:-:-:-:1 LEA track0I0.CC, ti0, param_I[0], 2; +--:-:-:-:1 LEA.HI.X track0I1, ti0, param_I[1], RZ, 2; +--:-:-:-:1 LEA track1I0.CC, ti1, param_I[0], 2; +--:-:-:-:1 LEA.HI.X track1I1, ti1, param_I[1], RZ, 2; + +// trackE = k*MPQN + m*PQN + p*QN + tidY +--:-:-:-:1 XMAD.LO2C te, k, param_MPQN, tidY; +--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te; +--:-:-:-:1 XMAD.LO2C te, p, param_QN, te; +--:-:-:-:1 XMAD te, Q, param_N, te; +--:-:-:-:1 LEA trackE0.CC, te, param_E[0], 2; +--:-:-:-:0 LEA.HI.X trackE1, te, param_E[1], RZ, 2; + +// Bounds check x,y,z,c for each I track. +// If out of bounds, this will set the track address to -1 +--:-:-:-:1 ISET.GE.AND cC0, c0, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd0, z0, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD0, z0, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh0, y0, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH0, y0, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw0, x0, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW0, x0, param_W, PT; +--:-:-:-:1 LOP.OR track0I0, track0I0, cC0; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe; + +--:-:-:-:1 ISET.GE.AND cC1, c1, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd1, z1, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD1, z1, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh1, y1, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH1, y1, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw1, x1, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW1, x1, param_W, PT; +--:-:-:-:1 LOP.OR track1I0, track1I0, cC1; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + 
+--:-:-:-:1 ISETP.NE.AND P2, PT, track0I0, -1, PT; +--:-:-:-:1 ISETP.NE.AND P3, PT, track1I0, -1, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, k, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:-:1 IADD loopN, loopN, param_N; + + +--:-:-:Y:5 @P0 BRA.U FIRST_LOAD; + +INIT_LOOP: + +--:-:1:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +--:-:1:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>]; +--:-:1:-:2 LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + +NEXT_16N: + + + + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD loopN, loopN, -16;\n", + + # p0 = (N & 16) == 0 + # p1 = N >= 32 && p0 + j0c14 => "--:-:-:-:1 LOP.AND.NZ P0, RZ, loopN, 16;\n", + j0c28 => "--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 32, P0;\n", + + + j1c8 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 0*128 + 0 + 0>], load0I08;\n", + j1c10 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 1*128 + 0 + 0>], load0I09;\n", + j1c12 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 2*128 + 0 + 0>], load0I10;\n", + j1c14 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 3*128 + 0 + 0>], load0I11;\n", + j1c16 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 8*128 + 0 + 16>], load0I12;\n", + j1c18 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 9*128 + 0 + 16>], load0I13;\n", + j1c20 => "--:-:-:-:1 \@P0 STS [writeIs + 4x<10*128 + 0 + 16>], load0I14;\n", + j1c22 => "--:-:-:-:1 \@P0 STS [writeIs + 4x<11*128 + 0 + 16>], load0I15;\n", + + j2c8 => "02:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 + 0 + 0>], load0I00;\n", + j2c10 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 + 0 + 0>], load0I01;\n", + j2c12 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 + 0 + 0>], load0I02;\n", + j2c14 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 + 0 + 0>], load0I03;\n", + j2c16 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 + 0 + 16>], load0I04;\n", + j2c18 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 9*128 + 0 + 16>], load0I05;\n", + j2c20 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 + 0 + 
16>], load0I06;\n", + j2c22 => "--:2:-:-:1 \@!P0 STS [writeIs + 4x<11*128 + 0 + 16>], load0I07;\n", + + j2c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, track0I0, -1, P1;\n", + j2c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n", + + j3c8 => "02:-:-:-:1 \@P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>];\n", + j3c10 => "--:-:-:-:1 \@P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>];\n", + j3c12 => "--:-:-:-:1 \@P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>];\n", + j3c14 => "--:5:2:-:1 \@P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>];\n", + + j4c8 => "--:-:-:-:1 \@P3 LDS.U.128 load0I00, [addr_zero];\n", + j4c10 => "--:-:-:-:1 \@P3 LDS.U.128 load0I04, [addr_zero];\n", + j5c8 => "--:-:-:-:1 \@P3 LDS.U.128 load0I08, [addr_zero];\n", + j5c10 => "--:-:-:-:1 \@P3 LDS.U.128 load0I12, [addr_zero];\n", + + j5c57 => "10:-:-:-:1 \@P2 IADD track0I0.CC, track0I0, 4x<32>;\n", + j5c62 => "--:-:-:-:1 \@P2 IADD.X track0I1, track0I1, RZ;\n", + + j6c8 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 0*128 + 64 + 0>], load1I08;\n", + j6c10 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 1*128 + 64 + 0>], load1I09;\n", + j6c12 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 2*128 + 64 + 0>], load1I10;\n", + j6c14 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 3*128 + 64 + 0>], load1I11;\n", + j6c16 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I12;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I13;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I14;\n", + j6c22 => "--:-:-:-:1 \@P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I15;\n", + + j7c8 => "04:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 + 64 + 0>], load1I00;\n", + j7c10 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 + 64 + 0>], load1I01;\n", + j7c12 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 + 64 + 0>], load1I02;\n", + j7c14 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 + 64 + 0>], load1I03;\n", + j7c16 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04;\n", + j7c18 => "--:-:-:-:1 \@!P0 STS 
[writeIs + 4x< 9*128 + 64 + 16>], load1I05;\n", + j7c20 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I06;\n", + j7c22 => "--:3:-:-:1 \@!P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I07;\n", + + j7c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, track1I0, -1, P1;\n", + j7c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n", + + j8c8 => "04:-:-:-:1 \@P2 LDG.E.CI.128 load1I00, [track1I + 4x< 0>];\n", + j8c10 => "--:-:-:-:1 \@P2 LDG.E.CI.128 load1I04, [track1I + 4x< 8>];\n", + j8c12 => "--:-:-:-:1 \@P2 LDG.E.CI.128 load1I08, [track1I + 4x<16>];\n", + j8c14 => "--:5:3:-:1 \@P2 LDG.E.CI.128 load1I12, [track1I + 4x<24>];\n", + + j9c8 => "--:-:-:-:1 \@P3 LDS.U.128 load1I00, [addr_zero];\n", + j9c10 => "--:-:-:-:1 \@P3 LDS.U.128 load1I04, [addr_zero];\n", + j10c8 => "--:-:-:-:1 \@P3 LDS.U.128 load1I08, [addr_zero];\n", + j10c10 => "--:-:-:-:1 \@P3 LDS.U.128 load1I12, [addr_zero];\n", + + j10c57 => "10:-:-:-:1 \@P2 IADD track1I0.CC, track1I0, 4x<32>;\n", + j10c62 => "--:-:-:-:1 \@P2 IADD.X track1I1, track1I1, RZ;\n", + + + j11c8 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 0*64 + 0>], loadE08;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 1*64 + 0>], loadE09;\n", + j11c12 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 2*64 + 0>], loadE10;\n", + j11c14 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 3*64 + 0>], loadE11;\n", + j11c16 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 8*64 + 16>], loadE12;\n", + j11c18 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 9*64 + 16>], loadE13;\n", + j11c20 => "--:-:-:-:1 \@P0 STS [writeEs + 4x<10*64 + 16>], loadE14;\n", + j11c22 => "--:-:-:-:1 \@P0 STS [writeEs + 4x<11*64 + 16>], loadE15;\n", + + j12c8 => "08:-:-:-:1 \@!P0 STS [writeEs + 4x< 0*64 + 0>], loadE00;\n", + j12c10 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 1*64 + 0>], loadE01;\n", + j12c12 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 2*64 + 0>], loadE02;\n", + j12c14 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 3*64 + 0>], loadE03;\n", + j12c16 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 8*64 + 16>], 
loadE04;\n", + j12c18 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 9*64 + 16>], loadE05;\n", + j12c20 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x<10*64 + 16>], loadE06;\n", + j12c22 => "--:4:-:-:1 \@!P0 STS [writeEs + 4x<11*64 + 16>], loadE07;\n", + + j12c24 => "--:-:-:-:1 ISETP.LT.AND P2, PT, k, param_K, P1;\n", + + j13c8 => "08:-:-:-:1 \@P2 LDG.E.CI.128 loadE00, [trackE + 4x< 0>];\n", + j13c10 => "--:-:-:-:1 \@P2 LDG.E.CI.128 loadE04, [trackE + 4x< 8>];\n", + j13c12 => "--:-:-:-:1 \@P2 LDG.E.CI.128 loadE08, [trackE + 4x<16>];\n", + j13c14 => "--:5:4:-:1 \@P2 LDG.E.CI.128 loadE12, [trackE + 4x<24>];\n", + + j15c57 => "10:-:-:-:1 \@P2 IADD trackE0.CC, trackE0, 4x<32>;\n", + j15c62 => "--:-:-:-:1 \@P2 IADD.X trackE1, trackE1, RZ;\n", + + # p0 = N >= 16 and not (N == 32 and (p or q)) + j14c8 => "--:-:-:-:1 ISETP.EQ.AND P0, PT, loopN, 32, PT;\n", + j14c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 16, PT;\n", + j14c22 => "--:-:-:-:1 PSETP.OR.AND P0, PT, P5, P6, P0;\n", + j14c35 => "--:-:-:-:1 PSETP.AND.AND P0, PT, !P0, P1, PT;\n", + + j14c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "20:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j15c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_16N;\n" . + "--:-:-:-:0 \@P5 IADD q, q, param_grid_Q;\n" . + "01:-:-:Y:5 \@P5 BRA.U NEXT_PQ;\n" . + "--:-:-:-:1 \@P6 MOV q, qq;\n" . + "--:-:-:-:0 \@P6 IADD p, p, param_grid_P;\n" . + "--:-:-:Y:5 \@P6 BRA.U NEXT_PQ;\n" . + "--:-:-:Y:5 BRA.U FINISH;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? 
'@P0' : ' '; + my $shift = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2; + my $barrier = $j == 14 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +FIRST_LOAD: + +--:-:-:-:0 PSETP.AND.AND P0, PT, PT, PT, !PT; + +--:-:-:-:1 @P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>]; +--:-:-:-:1 @P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>]; +--:-:-:-:1 @P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>]; +--:-:1:-:1 @P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>]; +--:-:-:-:1 @!P2 LDS.U.128 load0I00, [addr_zero]; +--:-:-:-:1 @!P2 LDS.U.128 load0I04, [addr_zero]; +--:-:-:-:1 @!P2 LDS.U.128 load0I08, [addr_zero]; +--:-:4:-:1 @!P2 LDS.U.128 load0I12, [addr_zero]; + +// p1 = N == 32 and (p or q) +--:-:-:-:0 ISETP.EQ.AND P1, PT, loopN, 32, PT; + +--:-:-:-:1 @P3 LDG.E.CI.128 load1I00, [track1I + 4x< 0>]; +--:-:-:-:1 @P3 LDG.E.CI.128 load1I04, [track1I + 4x< 8>]; +--:-:-:-:1 @P3 LDG.E.CI.128 load1I08, [track1I + 4x<16>]; +--:-:2:-:1 @P3 LDG.E.CI.128 load1I12, [track1I + 4x<24>]; +--:-:-:-:1 @!P3 LDS.U.128 load1I00, [addr_zero]; +--:-:-:-:1 
@!P3 LDS.U.128 load1I04, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.128 load1I08, [addr_zero]; +--:-:5:-:1 @!P3 LDS.U.128 load1I12, [addr_zero]; + +--:-:-:-:1 @P4 LDG.E.CI.128 loadE00, [trackE + 4x< 0>]; +--:-:-:-:1 @P4 LDG.E.CI.128 loadE04, [trackE + 4x< 8>]; +--:-:-:-:1 @P4 LDG.E.CI.128 loadE08, [trackE + 4x<16>]; +--:-:3:-:1 @P4 LDG.E.CI.128 loadE12, [trackE + 4x<24>]; +--:-:-:-:1 @!P4 LDS.U.128 loadE00, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 loadE04, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 loadE08, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 loadE12, [addr_zero]; + +--:-:-:-:0 PSETP.OR.AND P1, PT, P5, P6, P1; + +09:-:-:-:1 STS [writeIs + 4x< 0*128 + 0 + 0>], load0I00; +--:-:-:-:1 STS [writeIs + 4x< 1*128 + 0 + 0>], load0I01; +--:-:-:-:1 STS [writeIs + 4x< 2*128 + 0 + 0>], load0I02; +--:-:-:-:1 STS [writeIs + 4x< 3*128 + 0 + 0>], load0I03; +--:-:-:-:1 STS [writeIs + 4x< 8*128 + 0 + 16>], load0I04; +--:-:-:-:1 STS [writeIs + 4x< 9*128 + 0 + 16>], load0I05; +--:-:-:-:1 STS [writeIs + 4x<10*128 + 0 + 16>], load0I06; +--:-:-:-:1 STS [writeIs + 4x<11*128 + 0 + 16>], load0I07; + +--:-:-:-:6 @P2 IADD track0I0.CC, track0I0, 4x<32>; +--:-:-:-:0 @P2 IADD.X track0I1, track0I1, RZ; + +12:-:-:-:1 STS [writeIs + 4x< 0*128 + 64 + 0>], load1I00; +--:-:-:-:1 STS [writeIs + 4x< 1*128 + 64 + 0>], load1I01; +--:-:-:-:1 STS [writeIs + 4x< 2*128 + 64 + 0>], load1I02; +--:-:-:-:1 STS [writeIs + 4x< 3*128 + 64 + 0>], load1I03; +--:-:-:-:1 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04; +--:-:-:-:1 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I05; +--:-:-:-:1 STS [writeIs + 4x<10*128 + 64 + 16>], load1I06; +--:-:-:-:1 STS [writeIs + 4x<11*128 + 64 + 16>], load1I07; + +--:-:-:-:3 @P3 IADD track1I0.CC, track1I0, 4x<32>; +--:-:-:-:2 PSETP.AND.AND P5, PT, P5, P1, PT; +--:-:-:-:1 PSETP.AND.AND P6, PT, P6, P1, PT; +--:-:-:-:0 @P3 IADD.X track1I1, track1I1, RZ; + +24:-:-:-:1 STS [writeEs + 4x< 0*64 + 0>], loadE00; +--:-:-:-:1 STS [writeEs + 4x< 1*64 + 0>], loadE01; +--:-:-:-:1 STS [writeEs + 4x< 2*64 + 
0>], loadE02; +--:-:-:-:1 STS [writeEs + 4x< 3*64 + 0>], loadE03; +--:-:-:-:1 STS [writeEs + 4x< 8*64 + 16>], loadE04; +--:-:-:-:1 STS [writeEs + 4x< 9*64 + 16>], loadE05; +--:-:-:-:1 STS [writeEs + 4x<10*64 + 16>], loadE06; +--:1:-:-:1 STS [writeEs + 4x<11*64 + 16>], loadE07; + +--:-:-:-:6 @P4 IADD trackE0.CC, trackE0, 4x<32>; +--:-:-:-:1 @P4 IADD.X trackE1, trackE1, RZ; + +--:-:-:-:1 IADD readEs, readEs, -swapBuf; +--:-:-:-:0 IADD readIs, readIs, -swapBuf; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:0 @P5 IADD q, q, param_grid_Q; +--:-:-:Y:5 @P5 BRA.U NEXT_PQ; +--:-:-:-:0 @P6 IADD p, p, param_grid_P; +--:-:-:Y:5 @P6 BRA.U NEXT_PQ; + +--:-:-:-:2 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:Y:5 BRA.U INIT_LOOP; + + +FINISH: + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID2; +--:-:2:-:1 S2R blkI, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.X; +--:-:-:-:5 BRA.U END_CTAID2; +CTAID2: +--:-:2:-:1 S2R blkI, SR_CTAID.X; +--:-:3:-:1 S2R blkE, SR_CTAID.Y; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.Z; +END_CTAID2: + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readEs, readEs, -4x; +--:-:-:-:1 @P0 IADD readIs, readIs, -swapBuf; +--:-:-:-:1 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 64 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 4; + + +// readCs = ((tid & 96) << 3) | (tid & 31) +01:-:-:-:1 LOP.AND tid31, tid, 31; +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + + +// kk = blkE*64 + tid31; +04:-:-:-:1 ISCADD kk, blkE, tid31, 6; +--:-:-:-:1 IADD kk, kk, param_offset_K; + + +// crst 
= blkI*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 crst00, tid96, 1; +02:-:-:-:1 ISCADD crst00, blkI, crst00, 7; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; +--:-:-:-:1 SHL K4, K, 4; +--:-:-:-:1 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +--:-:-:-:1 VMAD.U16.U16 tf, crst00, K, kk; +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSTK, param_CRSTK; +08:-:-:-:1 XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ; + }; + } + return ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 0x2; +--:-:-:-:1 LEA.HI.X track00F1, tf, param_F[1], RZ, 0x2; + +--:-:-:-:1 MOV alpha, param_alpha; + +// kk < K +--:-:-:-:1 ISETP.LT.AND P5, PT, kk, param_K, PT; +--:-:-:-:1 IADD kk, kk, 32; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, PT; + + + +--:-:-:-:6 IADD track04F0.CC, track00F0, K4; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:6 IADD track08F0.CC, track04F0, K4; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:6 IADD track12F0.CC, track08F0, K4; +--:-:-:-:1 IADD.X track12F1, track08F1, RZ; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD track00F0.CC, track00F0, K60;\n" . + "--:-:-:-:1 IADD crst00, crst00, 60;\n" . + "--:-:-:-:1 IADD.X track00F1, track00F1, RZ;\n" . + "--:-:-:-:5 IADD track04F0.CC, track04F0, K60;\n" . + "--:-:-:-:1 IADD crst04, crst04, 60;\n" . + "--:-:-:-:1 IADD.X track04F1, track04F1, RZ;\n" . + "--:-:-:-:5 IADD track08F0.CC, track08F0, K60;\n" . + "--:-:-:-:1 IADD crst08, crst08, 60;\n" . + "--:-:-:-:1 IADD.X track08F1, track08F1, RZ;\n" . + "--:-:-:-:5 IADD track12F0.CC, track12F0, K60;\n" . + "--:-:-:-:1 IADD crst12, crst12, 60;\n" . + "--:-:-:-:1 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL f0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL f1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL f2, cx2y%d, alpha;\n" . 
+ "--:-:-:-:1 FMUL f3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL f4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL f5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL f6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL f7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +--:-:-:-:0 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], f0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], f4; + +--:-:1:-:1 LDS f0, [readCs + 4x<0*64 + 00>]; +--:-:2:-:1 LDS f2, [readCs + 4x<1*64 + 00>]; +--:-:3:-:1 LDS f4, [readCs + 4x<2*64 + 00>]; +--:-:4:-:1 LDS f6, [readCs + 4x<3*64 + 00>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:-:-:-:1 @P0 STG.E.CG [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 STG.E.CG [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 STG.E.CG [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 STG.E.CG [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } + else + { + return q{ +01:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } ++] + +--:-:1:-:1 LDS f1, 
[readCs + 4x<0*64 + 32>]; +--:-:2:-:1 LDS f3, [readCs + 4x<1*64 + 32>]; +--:-:3:-:1 LDS f5, [readCs + 4x<2*64 + 32>]; +--:-:4:-:1 LDS f7, [readCs + 4x<3*64 + 32>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:1:-:-:1 @P0 STG.E.CG [track00F + 4x<32>], f1; +02:2:-:-:1 @P1 STG.E.CG [track04F + 4x<32>], f3; +04:3:-:-:1 @P2 STG.E.CG [track08F + 4x<32>], f5; +08:4:-:-:1 @P3 STG.E.CG [track12F + 4x<32>], f7; + }; + } + else + { + return q{ +01:1:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1; +02:2:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3; +04:3:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5; +08:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7; + }; + } ++] + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X128_N128.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X128_N128.sass new file mode 100644 index 0000000..8f91aba --- /dev/null +++ b/Kernel/Convolution/Maxwell/sconv_xprop_X128_N128.sass @@ -0,0 +1,233 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[- + our $prefix = 's'; + our $shareI = 128; + our $shareF = 128; + our $stepI = 32; + our $stepF = 64; +-] + + + + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + szShareF : (128*8) + szShareI : (128*8) + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<128*8*2 + 128*8*2 + 4> + addr_m : 4x<128*8*2 + 128*8*2 + 4> + addr_p : 4x<128*8*2 + 128*8*2 + 5> + addr_q : 4x<128*8*2 + 128*8*2 + 6> + addr_k : 4x<128*8*2 + 128*8*2 + 7> + addr_szLut : 4x<128*8*2 + 128*8*2 + 8> + addr_lut : 4x<128*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne + 72-111 ~ tid1, tid128, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-107 : loadI<0-3>, loadF<0-3> + + 108-111 ~ offsetF, offsetI, offsetFc, offsetIc + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 114-122 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset + 123-127 ~ readFs, readIs, tid, idx_N + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-122 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 
S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidX = (tid & 31) << 2 +// tidY = tid >> 5 +--:-:-:-:1 LOP.AND tidX, tid, 31; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 5; + +// trackF += blkF*128 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 7; + +// trackI += blkI*128 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 7; + +// writeS = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 SHL writeS, writeS, 2; + +// readFs = ((tid & 112) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 112; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7) +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 SHR.U32 tid128, tid128, 3; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid128; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.128 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero]; + +12:-:-:-:1 STS.128 [writeS], loadF; +24:1:-:-:1 STS.128 [writeS + 4x], loadI; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.128 loadI, [trackI]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 
VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c40 => "02:2:-:-:1 \@P0 STS.128 [writeS], loadF;\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "20:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "02:-:2:-:1 \@P1 LDG.E.CI.128 loadF, [trackF];\n", + + + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x], loadI;\n", + + j6c54 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j6c59 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j6c61 => "04:-:3:-:1 \@P1 LDG.E.128 loadI, [trackI];\n", + + j6c62 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x;\n" . 
+ "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; + + + +// tidOX = (tid & 7) << 2 + (tid & 128) >> 1 +// tidOY = (tid & 127) >> 3 +--:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 LOP.AND tidOX2, tid, 128; +--:-:-:-:1 SHR.U32 tidOX2, tidOX2, 1; +--:-:-:-:1 LOP.OR tidOX, tidOX, tidOX2; +--:-:-:-:1 LOP.AND tidOY, tid, 127; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 LOP.AND readIs, readIs, 0x1ff; +--:-:-:-:1 LOP.AND readFs, readFs, 0x0ff; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +--:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*128 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 7; + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X128_N64.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X128_N64.sass new file mode 100644 index 0000000..d7bd0a1 --- /dev/null +++ b/Kernel/Convolution/Maxwell/sconv_xprop_X128_N64.sass @@ -0,0 +1,246 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $prefix = 's'; + our $shareI = 64; + our $shareF = 128; + our $stepI = 32; + our $stepF = 64; +-] + + + + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + szShareF : (128*8) + szShareI : (64*8) + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 128*8*2 + 4> + addr_m : 4x<64*8*2 + 128*8*2 + 4> + addr_p : 4x<64*8*2 + 128*8*2 + 5> + addr_q : 4x<64*8*2 + 128*8*2 + 6> + addr_k : 4x<64*8*2 + 128*8*2 + 7> + addr_szLut : 4x<64*8*2 + 128*8*2 + 8> + addr_lut : 4x<64*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-111 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-111 : loadI<0-3>, loadF<0-7> + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 104-107 ~ offsetF, offsetIc, offsetFc + + 114-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI + 125-127 ~ readFs, readIs, swapBuf + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-124 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, 
out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidX = (tid & 15) << 2 +// tidY = tid >> 4 +--:-:-:-:1 LOP.AND tidX, tid, 15; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 4; + +// trackF += blkF*128 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 7; + +// trackI += blkI*64 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 6; + +// writeFs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidX, 7; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 6; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = ((tid & -16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = (tid >> 1) & 7 +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>]; +--:-:3:-:1 @P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>]; +--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; +--:-:5:-:1 @!P1 LDS.U.128 loadF4, [addr_zero]; + +--:-:4:-:1 @P1 LDG.E.128 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero]; + +12:-:-:-:1 STS.128 [writeFs + 4x<00>], loadF0; +04:-:-:-:1 STS.128 [writeFs + 4x<64>], loadF4; + +28:1:-:-:1 STS.128 [writeIs], loadI; + +[+ loop_setup() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>]; +--:5:3:-:1 @P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>]; +--:-:4:-:1 @P1 LDG.E.128 loadI, [trackI]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 
ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c40 => "02:2:-:-:1 \@P0 STS.128 [writeFs + 4x<00>], loadF0;\n", + + j2c10 => "02:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "20:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "10:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>];\n", + + j4c8 => "04:3:-:-:1 \@P0 STS.128 [writeFs + 4x<64>], loadF4;\n", + + j4c60 => "04:5:3:-:1 \@P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>];\n", + + j6c8 => "08:4:-:-:1 \@P0 STS.128 [writeIs], loadI;\n", + + j6c55 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j6c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j6c62 => "08:5:4:-:1 \@P1 LDG.E.128 loadI, [trackI];\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 2 +// tidOY = tid >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 SHR.U32 tidOY, tid, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 64 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 6; + +// Mul by 4 here expands k stride back out +// k = blkF*128 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 7; + +[+ output_setup(63, 0, 6) +] + + + +[+ output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X32_N128.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X32_N128.sass new file mode 100644 index 0000000..568e714 --- /dev/null +++ b/Kernel/Convolution/Maxwell/sconv_xprop_X32_N128.sass @@ -0,0 +1,262 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $prefix = 's'; + our $shareI = 128; + our $shareF = 32; + our $stepI = 32; + our $stepF = 16; +-] + + + + + addr_zero : 4x<32*8*2 + 128*8*2 + 0> + szShareF : (32*8) + szShareI : (128*8) + + addr_zero : 4x<32*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<32*8*2 + 128*8*2 + 4> + addr_m : 4x<32*8*2 + 128*8*2 + 4> + addr_p : 4x<32*8*2 + 128*8*2 + 5> + addr_q : 4x<32*8*2 + 128*8*2 + 6> + addr_k : 4x<32*8*2 + 128*8*2 + 7> + addr_szLut : 4x<32*8*2 + 128*8*2 + 8> + addr_lut : 4x<32*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-69 : m, p, q + 64-69 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne + 70-113 ~ tid1, tid32, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 70-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-119 : loadI<00-15>, loadF<0-3> + + 120-121 : sliceI, sliceF + 120-121 : sliceIF<0-1> + + 122-140 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetF, offsetIc, offsetFc + 141-155 ~ readFs, readIs, swapBuf, tid, idx_N + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-140 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, 
out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] + +[+ get_mpqk() +] + +// tidX = (tid & 7) << 2 +// tidY = tid >> 3 +--:-:-:-:1 LOP.AND tidX, tid, 7; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 3; + +// trackF += blkF*32 + tidX + offset_K +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 5; + +// trackI += blkI*128 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 7; + +// writeFs = (32*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidX, 5; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +// readFs = (((tid & 16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:0 SHL readFs, readFs, 4; + +// readIs = ((tid & 32) >> 1) | ((tid >> 1) & 7) << 4 +--:-:-:-:1 LOP.AND tid32, tid, 32; +--:-:-:-:1 SHR.U32 tid32, tid32, 1; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid32; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, 4x; + + + +[+ load_lut() +] + +--:-:1:-:1 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; + +--:-:2:-:1 @P1 LDG.E.128 loadI00, [trackI + 4x< 0>]; +--:-:3:-:1 @P1 LDG.E.128 loadI04, [trackI + 4x<32>]; +--:-:4:-:1 @P1 LDG.E.128 loadI08, [trackI + 4x<64>]; +--:-:5:-:1 @P1 LDG.E.128 loadI12, [trackI + 4x<96>]; +--:-:-:-:1 @!P1 LDS.U.128 loadI00, [addr_zero]; +--:-:-:-:1 @!P1 LDS.U.128 loadI04, [addr_zero]; +--:-:-:-:1 @!P1 LDS.U.128 loadI08, [addr_zero]; +--:-:6:-:2 @!P1 LDS.U.128 loadI12, [addr_zero]; + +21:-:-:-:1 STS.128 [writeFs], loadF0; + +02:-:-:-:1 STS.128 
[writeIs + 4x< 0>], loadI00; +04:-:-:-:1 STS.128 [writeIs + 4x<32>], loadI04; +08:-:-:-:1 STS.128 [writeIs + 4x<64>], loadI08; +10:1:-:-:1 STS.128 [writeIs + 4x<96>], loadI12; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:-:-:1 @P1 LDG.E.128 loadI00, [trackI + 4x< 0>]; +--:-:3:-:1 @P1 LDG.E.128 loadI04, [trackI + 4x<32>]; +--:-:-:-:1 @P1 LDG.E.128 loadI08, [trackI + 4x<64>]; +--:5:4:-:1 @P1 LDG.E.128 loadI12, [trackI + 4x<96>]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c40 => "02:-:-:-:1 \@P0 STS.128 [writeFs], loadF0;\n", + + j1c62 => "--:-:2:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "02:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF];\n", + + j3c8 => "04:-:-:-:1 \@P0 STS.128 [writeIs + 4x< 0>], loadI00;\n", + j3c10 => "--:3:-:-:1 \@P0 STS.128 [writeIs + 4x<32>], loadI04;\n", + + j3c55 => 
"10:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j3c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j4c8 => "04:-:-:-:1 \@P1 LDG.E.128 loadI00, [trackI + 4x< 0>];\n", + j4c10 => "--:-:3:-:1 \@P1 LDG.E.128 loadI04, [trackI + 4x<32>];\n", + + j6c8 => "08:-:-:-:1 \@P0 STS.128 [writeIs + 4x<64>], loadI08;\n", + j6c10 => "--:4:-:-:1 \@P0 STS.128 [writeIs + 4x<96>], loadI12;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "08:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c8 => "--:-:-:-:1 \@P1 LDG.E.128 loadI08, [trackI + 4x<64>];\n", + j7c10 => "--:5:4:-:1 \@P1 LDG.E.128 loadI12, [trackI + 4x<96>];\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; + + + +// tidOX = (tid & 7) << 2 + (tid & 32) << 1 +// tidOY = (tid & 31) >> 3 +--:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 LOP.AND tidOX2, tid, 32; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 ISCADD tidOX, tidOX2, tidOX, 1; +--:-:-:-:1 LOP.AND tidOY, tid, 31; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +--:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*32 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +--:-:-:-:1 ISCADD k, idx_K, tidOY, 5; + +[+ output_setup(63, 1, 6) +] + + + +[+ 
output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X64_N128.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X64_N128.sass new file mode 100644 index 0000000..b782b8a --- /dev/null +++ b/Kernel/Convolution/Maxwell/sconv_xprop_X64_N128.sass @@ -0,0 +1,253 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $prefix = 's'; + our $shareI = 128; + our $shareF = 64; + our $stepI = 64; + our $stepF = 32; +-] + + + + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + szShareF : (64*8) + szShareI : (128*8) + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 128*8*2 + 4> + addr_m : 4x<64*8*2 + 128*8*2 + 4> + addr_p : 4x<64*8*2 + 128*8*2 + 5> + addr_q : 4x<64*8*2 + 128*8*2 + 6> + addr_k : 4x<64*8*2 + 128*8*2 + 7> + addr_szLut : 4x<64*8*2 + 128*8*2 + 8> + addr_lut : 4x<64*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-111 ~ tid1, tid64, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, 
idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-111 : loadI<0-7>, loadF<0-3> + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 108-111 ~ offsetF, offsetIc, offsetFc + + 114-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI + 125-127 ~ readFs, readIs, swapBuf + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-124 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidX = (tid & 15) << 2 +// tidY = tid >> 4 +--:-:-:-:1 LOP.AND tidX, tid, 15; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 4; + +// trackF += blkF*64 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 6; + +// trackI += blkI*128 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 7; + +// writeFs = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidX, 6; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = ((tid & 48) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 48; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 64) >> 3) | ((tid >> 1) & 7) +--:-:-:-:1 LOP.AND tid64, 
tid, 64; +--:-:-:-:1 SHR.U32 tid64, tid64, 3; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid64; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:5:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x<00>]; +--:-:4:-:1 @P1 LDG.E.128 loadI4, [trackI + 4x<64>]; +--:-:-:-:1 @!P1 LDS.U.128 loadI0, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 loadI4, [addr_zero]; + +12:-:-:-:1 STS.128 [writeFs], loadF0; + +24:-:-:-:1 STS.128 [writeIs + 4x<00>], loadI0; +08:1:-:-:1 STS.128 [writeIs + 4x<64>], loadI4; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:3:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x<00>]; +--:5:4:-:1 @P1 LDG.E.128 loadI4, [trackI + 4x<64>]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c40 => "02:2:-:-:1 \@P0 STS.128 [writeFs], loadF0;\n", + + j2c10 => "02:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "20:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + 
j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF];\n", + + j3c8 => "04:3:-:-:1 \@P0 STS.128 [writeIs + 4x<00>], loadI0;\n", + + j3c55 => "10:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j3c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j4c8 => "04:-:3:-:1 \@P1 LDG.E.128 loadI0, [trackI + 4x<00>];\n", + + j6c8 => "08:4:-:-:1 \@P0 STS.128 [writeIs + 4x<64>], loadI4;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "08:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c8 => "--:5:4:-:1 \@P1 LDG.E.128 loadI4, [trackI + 4x<64>];\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 2 + (tid & 64) >> 1 +// tidOY = (tid & 63) >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 LOP.AND tidOX2, tid, 64; +--:-:-:-:1 SHR.U32 tidOX2, tidOX2, 1; +--:-:-:-:1 LOP.OR tidOX, tidOX, tidOX2; +--:-:-:-:1 LOP.AND tidOY, tid, 63; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 7; 
+ +// Mul by 4 here expands k stride back out +// k = blkF*64 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 6; + +[+ output_setup(31, 1, 5) +] + + + +[+ output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X64_N64.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X64_N64.sass new file mode 100644 index 0000000..b42fbea --- /dev/null +++ b/Kernel/Convolution/Maxwell/sconv_xprop_X64_N64.sass @@ -0,0 +1,240 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[- + our $prefix = 's'; + our $shareI = 64; + our $shareF = 64; + our $stepI = 32; + our $stepF = 32; +-] + + + + + addr_zero : 4x<64*8*2 + 64*8*2 + 0> + szShareF : (64*8) + szShareI : (64*8) + + addr_zero : 4x<64*8*2 + 64*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 64*8*2 + 4> + addr_m : 4x<64*8*2 + 64*8*2 + 4> + addr_p : 4x<64*8*2 + 64*8*2 + 5> + addr_q : 4x<64*8*2 + 64*8*2 + 6> + addr_k : 4x<64*8*2 + 64*8*2 + 7> + addr_szLut : 4x<64*8*2 + 64*8*2 + 8> + addr_lut : 4x<64*8*2 + 64*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-66 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-113 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-115 : loadI<0-7>, loadF<0-7> + + 108-113 ~ offsetF, offsetIc, offsetFc + 114-115 : sliceI, sliceF + 114-115 : sliceIF<0-1> + + 116-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI + 126-127 ~ readFs, readIs + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-125 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 
S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] + +[+ get_mpqk() +] + +// tidX = (tid & 7) << 2 +// tidY = tid >> 3 +--:-:-:-:1 LOP.AND tidX, tid, 7; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 3; + +// trackF += blkF*64 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 6; + +// trackI += blkI*64 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 6; + +// writeS = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 6; +--:-:-:-:1 SHL writeS, writeS, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:0 SHL readFs, readFs, 4; + +// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:1:-:1 @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>]; +--:-:2:-:1 @P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>]; +--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; +--:-:5:-:2 @!P1 LDS.U.128 loadF4, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x< 0>]; +--:-:4:-:1 @P1 LDG.E.128 loadI4, [trackI + 4x<32>]; +--:-:-:-:1 @!P1 LDS.U.128 loadI0, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 loadI4, [addr_zero]; + +11:-:-:-:1 STS.128 [writeS + 4x<0*64 + 0>], loadF0; +02:-:-:-:1 STS.128 [writeS + 4x<0*64 + 32>], loadF4; + +24:-:-:-:1 STS.128 [writeS + 4x<8*64 + 0>], loadI0; +08:1:-:-:1 STS.128 [writeS + 4x<8*64 + 32>], loadI4; + +[+ loop_setup() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>]; +--:-:3:-:1 @P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>]; +--:-:4:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x< 0>]; +--:-:5:-:1 @P1 LDG.E.128 loadI4, [trackI + 4x<32>]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, 
PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c37 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*64 + 0>], loadF0;\n", + j1c39 => "04:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*64 + 32>], loadF4;\n", + + j1c62 => "02:-:2:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "02:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];\n", + j2c42 => "--:-:3:-:1 \@P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>];\n", + + j6c8 => "08:-:-:-:1 \@P0 STS.128 [writeS + 4x<8*64 + 0>], loadI0;\n", + j6c10 => "10:4:-:-:1 \@P0 STS.128 [writeS + 4x<8*64 + 32>], loadI4;\n", + + j6c55 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j6c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j6c62 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "08:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" . 
+ "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n", + + j7c8 => "--:-:4:-:1 \@P1 LDG.E.128 loadI0, [trackI + 4x< 0>];\n", + j7c10 => "--:-:5:-:1 \@P1 LDG.E.128 loadI4, [trackI + 4x<32>];\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 2 +// tidOY = tid >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 SHR.U32 tidOY, tid, 3; + +--:-:-:-:1 LOP.AND readIs, readIs, 0x7ff; +--:-:-:-:1 LOP.AND readFs, readFs, 0x7ff; + +// Div by 4 here collapses k stride +// writeCs = (readKs / 4) * 64 + readNs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 6; + +// Mul by 4 here expands k stride back out +// k = blkF*64 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 6; + +[+ output_setup(63, 0, 6) +] + + + +[+ output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Maxwell/xconv_direct_updat_64x32.sass b/Kernel/Convolution/Maxwell/xconv_direct_updat_64x32.sass new file mode 100644 index 0000000..803487e --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_direct_updat_64x32.sass @@ -0,0 +1,1077 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our ($type, $SN, $D); +our $determ = $D; +our $largeN = !$SN; +our $dtype = $type eq 'h' ? '.U16' : ''; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +our $vec_size = $type eq 'h' ? '64' : '128'; +our $dtype_shift = $type eq 'h' ? '1' : '2'; +our $dtype_size = $type eq 'h' ? '2' : '4'; +sub dtype { return $dtype; } +sub dtype_shift { return $dtype_shift; } +sub vec_size { return $vec_size; } +sub output_op { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; } +-] + + + + addr_zero : 4x<(32 + 64)*33*2> + szShareI : (64*33) + szShareE : (32*33) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_C : c[0x0][0x15c] + param_D : c[0x0][0x160] + param_H : c[0x0][0x164] + param_W : c[0x0][0x168] + param_N : c[0x0][0x16c] + param_K : c[0x0][0x170] + param_M : c[0x0][0x174] + param_P : c[0x0][0x178] + param_Q : c[0x0][0x17c] + param_str_d : c[0x0][0x180] + param_str_h : c[0x0][0x184] + param_str_w : c[0x0][0x188] + param_pad_d : c[0x0][0x18c] + param_pad_h : c[0x0][0x190] + param_pad_w : c[0x0][0x194] + param_dil_d : c[0x0][0x198] + param_dil_h : c[0x0][0x19c] + param_dil_w : c[0x0][0x1a0] + param_DHWN : c[0x0][0x1a4] + param_HWN : c[0x0][0x1a8] + param_WN : c[0x0][0x1ac] + param_MPQN16p : c[0x0][0x1b0] + param_MPQN : c[0x0][0x1b4] + param_PQN : c[0x0][0x1b8] + param_QN : c[0x0][0x1bc] + param_PQkc : c[0x0][0x1c0] + param_Qkc : c[0x0][0x1c4] + param_kc : c[0x0][0x1c8] + param_c : c[0x0][0x1cc] + param_k : c[0x0][0x1d0] + param_magic_PQkc : c[0x0][0x1d4] + param_shift_PQkc : c[0x0][0x1d8] + param_magic_Qkc : c[0x0][0x1dc] + param_shift_Qkc : c[0x0][0x1e0] + param_magic_kc : c[0x0][0x1e4] + param_shift_kc : c[0x0][0x1e8] + param_magic_c : 
c[0x0][0x1ec] + param_shift_c : c[0x0][0x1f0] + param_CTRSK : c[0x0][0x1f4] + param_CTRS : c[0x0][0x1f8] + param_TRS : c[0x0][0x1fc] + param_RS : c[0x0][0x200] + param_S : c[0x0][0x204] + param_magic_TRS : c[0x0][0x208] + param_shift_TRS : c[0x0][0x20c] + param_magic_RS : c[0x0][0x210] + param_shift_RS : c[0x0][0x214] + param_magic_S : c[0x0][0x218] + param_shift_S : c[0x0][0x21c] + param_superM : c[0x0][0x220] + param_superP : c[0x0][0x224] + param_superQ : c[0x0][0x228] + param_superN : c[0x0][0x22c] + param_shiftM : c[0x0][0x230] + param_shiftP : c[0x0][0x234] + param_shiftQ : c[0x0][0x238] + param_strideP : c[0x0][0x23c] + param_strideQ : c[0x0][0x240] + param_stridePQ : c[0x0][0x244] + param_gridP : c[0x0][0x248] + param_gridQ : c[0x0][0x24c] + param_loopX : c[0x0][0x250] + param_loopXp : c[0x0][0x254] + param_loopQ : c[0x0][0x258] + param_loopQp : c[0x0][0x25c] + param_loopN : c[0x0][0x260] + param_loopNp : c[0x0][0x264] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 96-119 : I0<0-3>, I1<0-3>, I2<0-3>, I3<0-3>, E0<0-3>, E1<0-3> + 120-131 : track0I<0-1>, track1I<0-1>, track2I<0-1>, track3I<0-1>, track0E<0-1>, track1E<0-1> + + 64-131 ~ tid, idx_MPQkc, idx_PQkc, idx_Qkc, idx_kc, idx_k, idx_c, magic_PQkc, magic_Qkc, neg_PQkc, neg_Qkc, neg_kc, neg_c, div1, div2, div3, tidX, tidX4, tidY, tid1, readEs2, tid32, tid32_2, neg_TRS, neg_RS, neg_S, super_m, m, mt, k, k16, ctrs<0-3>, trs<0-3>, rs<0-3>, c<0-3>, t<0-3>, z<0-3> + + 80-81 : super_p, super_q + 80-81 : 
pr, qs + 82-95 ~ p, te, pIn, qIn, predEt, ti<0-3>, y<0-3> + 80-95 ~ loopN, N + + 132-167 ~ tid7, q, n, idx_K, idx_C, idx_M, idx_P, start_P, idx_Q, start_Q, writeIs, writeEs, readIs, readEs, swapBuf, writeFs, predI, predE, init, x<0-3>, czOffset<0-3>, r<0-3>, s<0-3>, kmOffset + + 96-103 : track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 104-119 ~ f00_<0-3>, f04_<0-3>, f08_<0-3>, f12_<0-3> + 104-119 ~ Tid, tid_31, tid_32, K, K16, tf, idx_MPQ, xmad_determ + 120-131 ~ alpha, readFs, K1, kk, crst<00|04|08|12> + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQkc, SR_CTAID.X; +--:-:3:-:1 S2R idx_C, SR_CTAID.Y; +--:-:4:-:1 S2R idx_K, SR_CTAID.Z; + + + +--:-:-:-:1 STS.128 [addr_zero], RZ; + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// idx_M = idx_MPQkc / blk_PQkc +--:-:-:-:1 MOV magic_PQkc, param_magic_PQkc; +--:-:-:-:1 ISETP.NE.AND P0, PT, magic_PQkc, 1, PT; +02:-:-:-:1 @P0 XMAD div1, idx_MPQkc, magic_PQkc, RZ; +--:-:-:-:1 @P0 XMAD div2, idx_MPQkc, magic_PQkc.H1, RZ; +--:-:-:-:1 @P0 XMAD div3, idx_MPQkc.H1, magic_PQkc.H1, RZ; +--:-:-:-:1 @P0 XMAD.CHI div1, idx_MPQkc.H1, magic_PQkc, div1; +--:-:-:-:1 @P0 IADD3.RS idx_M, div1, div2, div3; +--:-:-:-:1 @P0 SHR.U32 idx_M, idx_M, param_shift_PQkc; +--:-:-:-:1 @!P0 SHR.U32 idx_M, idx_MPQkc, param_shift_PQkc; + +// idx_PQkc = idx_PQkc % blk_Qkc +--:-:-:-:1 IADD neg_PQkc, RZ, -param_PQkc; +--:-:-:-:1 XMAD.LO2 idx_PQkc, neg_PQkc, idx_M, idx_MPQkc; + +// idx_P = idx_PQkc / blk_Qkc +--:-:-:-:1 MOV magic_Qkc, param_magic_Qkc; +--:-:-:-:1 ISETP.NE.AND P1, PT, magic_Qkc, 1, PT; +--:-:-:-:1 @P1 XMAD div1, idx_PQkc, magic_Qkc, RZ; +--:-:-:-:1 @P1 XMAD div2, idx_PQkc, magic_Qkc.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, idx_PQkc.H1, magic_Qkc.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, idx_PQkc.H1, magic_Qkc, div1; +--:-:-:-:1 @P1 IADD3.RS idx_P, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 idx_P, idx_P, param_shift_Qkc; +--:-:-:-:1 @!P1 SHR.U32 idx_P, idx_PQkc, param_shift_Qkc; + 
+// idx_Qkc = idx_PQkc % blk_Qkc +--:-:-:-:1 IADD neg_Qkc, RZ, -param_Qkc; +--:-:-:-:1 XMAD.LO2 idx_Qkc, neg_Qkc, idx_P, idx_PQkc; + +// idx_Q = idx_Qkc / kc +--:-:-:-:1 XMAD.LO2C idx_Q, idx_Qkc, param_magic_kc, RZ; +--:-:-:-:1 SHR.U32 idx_Q, idx_Q, param_shift_kc; +// idx_kc = idx_Qkc % kc +--:-:-:-:1 IADD neg_kc, RZ, -param_kc; +--:-:-:-:1 XMAD.S16.U16 idx_kc, neg_kc, idx_Q, idx_Qkc; + +// idx_k = idx_kc / c +--:-:-:-:1 XMAD idx_k, idx_kc, param_magic_c, RZ; +--:-:-:-:1 SHR.U32 idx_k, idx_k, param_shift_c; +// idx_c = idx_kc % c +--:-:-:-:1 IADD neg_c, RZ, -param_c; +--:-:-:-:1 XMAD.S16.U16 idx_c, neg_c, idx_k, idx_kc; + +// idx_C = idx_C * blk_c + idx_c +// idx_K = idx_K * blk_k + idx_k +04:-:-:-:1 XMAD idx_C, idx_C, param_c, idx_c; +08:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +--:-:-:-:1 MOV start_P, idx_P; +--:-:-:-:1 MOV start_Q, idx_Q; + +// tidX = tid >> 3 +// tidY = (tid & 7) << 2 +// shiftX = tidY +01:-:-:-:1 SHR.U32 tidX, tid, 3; +--:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHL tidY, tid7, 2; + +// writeIs = (tidY*64 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 6; +--:-:-:-:1 IADD writeIs, writeIs, tidY; +--:-:-:-:1 SHL writeIs, writeIs, 2; + +// writeEs = (tidY*32 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeEs, tidY, tidX, 5; +--:-:-:-:1 IADD writeEs, writeEs, tidY; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x, 2; + +// readEs = (((tid >> 1) & 3) << 4 +--:-:-:-:1 BFE.U32 readEs, tid, 0x201; // 2 bits at position 1 + +// readIs = (((tid & 24) >> 2) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readIs, tid, 24; +--:-:-:-:1 SHR.U32 readIs, readIs, 2; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; + +// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5) +// tid32 = tid & -32 +--:-:-:-:1 LOP.AND tid32, tid, -32; + +// readEs2 = readEs + (tid32 >> 2) + (readIs << 2) +--:-:-:-:1 SHR.U32 tid32_2, tid32, 2; +--:-:-:-:1 IADD readEs2, tid32_2, readEs; +--:-:-:-:1 ISCADD readEs2, readIs, readEs2, 
2; + +--:-:-:-:1 SHL readIs, readIs, 4; +--:-:-:-:1 SHL readEs, readEs, 4; +--:-:-:-:1 SHL readEs2, readEs2, 4; + +// writeFs = readIs*32*4 + readEs2 +--:-:-:-:1 ISCADD writeFs, readIs, readEs2, 7; + +// Each block of 32 threads works on 8 lines, +// Also shift over each 8 lines by 8 (cumulative) +// readIs += tid32/4 * 64 * 4 + tid32/4 * 4 +// readEs += tid32/4 * 32 * 4 + tid32/4 * 4 + 4x +--:-:-:-:1 ISCADD readIs, tid32, readIs, 6; +--:-:-:-:1 ISCADD readEs, tid32, readEs, 5; +--:-:-:-:1 IADD readIs, readIs, tid32; +--:-:-:-:1 IADD3 readEs, readEs, 4x, tid32; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// Remap ctrs for better L1 cache performance with small N +// Maximize the amount of overlapping data requested within a warp. +// The L1 is partitioned in to 2 groups of 2 warps. +// ctrs = idx_C*64 + tidX*4 +--:-:-:-:1 SHL tidX4, tidX, 2; +--:-:-:-:1 ISCADD ctrs0, idx_C, tidX4, 6; +--:-:-:-:1 IADD ctrs1, ctrs0, 1; +--:-:-:-:1 IADD ctrs2, ctrs0, 2; +--:-:-:-:1 IADD ctrs3, ctrs0, 3; + +// c = ctrs / RST +--:-:-:-:1 XMAD.LO2C c0, ctrs0, param_magic_TRS, RZ; +--:-:-:-:1 XMAD.LO2C c1, ctrs1, param_magic_TRS, RZ; +--:-:-:-:1 XMAD.LO2C c2, ctrs2, param_magic_TRS, RZ; +--:-:-:-:1 XMAD.LO2C c3, ctrs3, param_magic_TRS, RZ; +--:-:-:-:1 SHR.U32 c0, c0, param_shift_TRS; +--:-:-:-:1 SHR.U32 c1, c1, param_shift_TRS; +--:-:-:-:1 SHR.U32 c2, c2, param_shift_TRS; +--:-:-:-:1 SHR.U32 c3, c3, param_shift_TRS; +// trs = ctrs % RST +--:-:-:-:1 IADD neg_TRS, RZ, -param_TRS; +--:-:-:-:1 XMAD.S16.U16 trs0, neg_TRS, c0, ctrs0; +--:-:-:-:1 XMAD.S16.U16 trs1, neg_TRS, c1, ctrs1; +--:-:-:-:1 XMAD.S16.U16 trs2, neg_TRS, c2, ctrs2; +--:-:-:-:1 XMAD.S16.U16 trs3, neg_TRS, c3, ctrs3; + +// t = trs / RS +--:-:-:-:1 XMAD t0, trs0, param_magic_RS, RZ; +--:-:-:-:1 XMAD t1, trs1, param_magic_RS, RZ; +--:-:-:-:1 XMAD t2, trs2, param_magic_RS, RZ; +--:-:-:-:1 XMAD t3, trs3, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t0, t0, param_shift_RS; +--:-:-:-:1 SHR.U32 t1, t1, param_shift_RS; +--:-:-:-:1 SHR.U32 t2, t2, 
param_shift_RS; +--:-:-:-:1 SHR.U32 t3, t3, param_shift_RS; +// rs = trs % RS +--:-:-:-:1 IADD neg_RS, RZ, -param_RS; +--:-:-:-:1 XMAD.S16.U16 rs0, neg_RS, t0, trs0; +--:-:-:-:1 XMAD.S16.U16 rs1, neg_RS, t1, trs1; +--:-:-:-:1 XMAD.S16.U16 rs2, neg_RS, t2, trs2; +--:-:-:-:1 XMAD.S16.U16 rs3, neg_RS, t3, trs3; + +// r = rs / S +--:-:-:-:1 XMAD r0, rs0, param_magic_S, RZ; +--:-:-:-:1 XMAD r1, rs1, param_magic_S, RZ; +--:-:-:-:1 XMAD r2, rs2, param_magic_S, RZ; +--:-:-:-:1 XMAD r3, rs3, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r0, r0, param_shift_S; +--:-:-:-:1 SHR.U32 r1, r1, param_shift_S; +--:-:-:-:1 SHR.U32 r2, r2, param_shift_S; +--:-:-:-:1 SHR.U32 r3, r3, param_shift_S; +// s = rs % S +--:-:-:-:1 IADD neg_S, RZ, -param_S; +--:-:-:-:1 XMAD.S16.U16 s0, neg_S, r0, rs0; +--:-:-:-:1 XMAD.S16.U16 s1, neg_S, r1, rs1; +--:-:-:-:1 XMAD.S16.U16 s2, neg_S, r2, rs2; +--:-:-:-:1 XMAD.S16.U16 s3, neg_S, r3, rs3; + +--:-:-:-:1 LOP.AND n, tid, param_superN; +--:-:-:-:1 SHL n, n, 2; + +// M,C,K are static coords so compute offsets and predicates once +--:-:-:-:1 SHL m, idx_M, param_shiftM; +--:-:-:-:1 BFE.U32 super_m, tid7, param_superM; +--:-:-:-:1 IADD m, m, super_m; + +// z = m * str_d - pad_d + (t * dil_d) +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; + +--:-:-:-:1 XMAD z0, t0, param_dil_d, mt; +--:-:-:-:1 XMAD z1, t1, param_dil_d, mt; +--:-:-:-:1 XMAD z2, t2, param_dil_d, mt; +--:-:-:-:1 XMAD z3, t3, param_dil_d, mt; +--:-:-:-:1 IADD z0, z0, -param_pad_d; +--:-:-:-:1 IADD z1, z1, -param_pad_d; +--:-:-:-:1 IADD z2, z2, -param_pad_d; +--:-:-:-:1 IADD z3, z3, -param_pad_d; + +// czOffset = c*DHWN + z*HWN +--:-:-:-:1 XMAD.LO2C czOffset0, c0, param_DHWN, RZ; +--:-:-:-:1 XMAD.LO2C czOffset1, c1, param_DHWN, RZ; +--:-:-:-:1 XMAD.LO2C czOffset2, c2, param_DHWN, RZ; +--:-:-:-:1 XMAD.LO2C czOffset3, c3, param_DHWN, RZ; +--:-:-:-:1 XMAD.S16.U16.LO2C czOffset0, z0, param_HWN, czOffset0; +--:-:-:-:1 XMAD.S16.U16.LO2C czOffset1, z1, param_HWN, czOffset1; +--:-:-:-:1 XMAD.S16.U16.LO2C czOffset2, 
z2, param_HWN, czOffset2; +--:-:-:-:1 XMAD.S16.U16.LO2C czOffset3, z3, param_HWN, czOffset3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, c0, param_C, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, c1, param_C, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, c2, param_C, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, c3, param_C, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, z0, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_D, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_D, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_D, P3; +--:-:-:-:1 ISETP.GE.AND P0, PT, z0, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; +--:-:-:-:1 P2R predI, PR, RZ, 0x0f; +--:-:-:-:1 SHL predI, predI, 8; + +// k = idx_K*32 + tidX +--:-:-:-:1 ISCADD k, idx_K, tidX, 5; + +// kmOffset = k*MPQN + m*PQN +--:-:-:-:1 XMAD.LO2C kmOffset, k, param_MPQN, RZ; +--:-:-:-:1 XMAD.LO2C kmOffset, m, param_PQN, kmOffset; + +--:-:-:-:1 IADD k16, k, 16; +--:-:-:-:1 ISETP.LT.AND P4, PT, m, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, k16, param_K, P4; +--:-:-:-:1 P2R predE, PR, RZ, 0x03; +--:-:-:-:1 SHL predE, predE, 2; + + + +--:-:-:-:5 CAL CALC_OFFSETS; +--:-:-:-:5 CAL DO_LOADS; +--:-:-:-:5 CAL CALC_OFFSETS; + +[+ + our $convert_in; + return $convert_in ? 
qq{ +02:-:-:-:1 $convert_in I03, I01.H1; +--:-:-:-:1 $convert_in I02, I01.H0; +--:-:-:-:1 $convert_in I01, I00.H1; +--:-:-:-:1 $convert_in I00, I00.H0; + +--:-:-:-:1 $convert_in I13, I11.H1; +--:-:-:-:1 $convert_in I12, I11.H0; +--:-:-:-:1 $convert_in I11, I10.H1; +--:-:2:-:1 $convert_in I10, I10.H0; + +04:-:-:-:1 $convert_in I23, I21.H1; +--:-:-:-:1 $convert_in I22, I21.H0; +--:-:-:-:1 $convert_in I21, I20.H1; +--:-:-:-:1 $convert_in I20, I20.H0; + +--:-:-:-:1 $convert_in I33, I31.H1; +--:-:-:-:1 $convert_in I32, I31.H0; +--:-:-:-:1 $convert_in I31, I30.H1; +--:-:3:-:1 $convert_in I30, I30.H0; + +08:-:-:-:1 $convert_in E03, E01.H1; +--:-:-:-:1 $convert_in E02, E01.H0; +--:-:-:-:1 $convert_in E01, E00.H1; +--:-:4:-:1 $convert_in E00, E00.H0; + +10:-:-:-:1 $convert_in E13, E11.H1; +--:-:-:-:1 $convert_in E12, E11.H0; +--:-:-:-:1 $convert_in E11, E10.H1; +--:-:5:-:1 $convert_in E10, E10.H0; + } : ''; ++] + +02:-:-:-:1 STS [writeIs + 4x<0*64 + 0*16>], I00; +--:-:-:-:1 STS [writeIs + 4x<1*64 + 0*16>], I01; +--:-:-:-:1 STS [writeIs + 4x<2*64 + 0*16>], I02; +--:-:-:-:1 STS [writeIs + 4x<3*64 + 0*16>], I03; + +--:-:-:-:1 STS [writeIs + 4x<0*64 + 1*16>], I10; +--:-:-:-:1 STS [writeIs + 4x<1*64 + 1*16>], I11; +--:-:-:-:1 STS [writeIs + 4x<2*64 + 1*16>], I12; +--:-:-:-:1 STS [writeIs + 4x<3*64 + 1*16>], I13; + +04:-:-:-:1 STS [writeIs + 4x<0*64 + 2*16>], I20; +--:-:-:-:1 STS [writeIs + 4x<1*64 + 2*16>], I21; +--:-:-:-:1 STS [writeIs + 4x<2*64 + 2*16>], I22; +--:-:-:-:1 STS [writeIs + 4x<3*64 + 2*16>], I23; + +--:-:-:-:1 STS [writeIs + 4x<0*64 + 3*16>], I30; +--:-:-:-:1 STS [writeIs + 4x<1*64 + 3*16>], I31; +--:-:-:-:1 STS [writeIs + 4x<2*64 + 3*16>], I32; +--:-:-:-:1 STS [writeIs + 4x<3*64 + 3*16>], I33; + +08:-:-:-:1 STS [writeEs + 4x<0*32 + 0*16>], E00; +--:-:-:-:1 STS [writeEs + 4x<1*32 + 0*16>], E01; +--:-:-:-:1 STS [writeEs + 4x<2*32 + 0*16>], E02; +--:-:-:-:1 STS [writeEs + 4x<3*32 + 0*16>], E03; + +10:-:-:-:1 STS [writeEs + 4x<0*32 + 1*16>], E10; +--:-:-:-:1 STS 
[writeEs + 4x<1*32 + 1*16>], E11; +--:-:-:-:1 STS [writeEs + 4x<2*32 + 1*16>], E12; +--:-:-:-:1 STS [writeEs + 4x<3*32 + 1*16>], E13; + +// init = bNextY ? 1 : 0 +--:-:-:-:0 SEL init, RZ, 1, !P6; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy4, [readIs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*32 + 16>]; + +--:-:-:-:5 CAL DO_LOADS; + +// init += bNextY ? 1 : 0 +--:-:-:-:0 @P6 IADD init, init, 1; + +--:-:-:-:5 CAL CALC_OFFSETS; +--:-:-:-:5 BRA.U MAIN_LOOP; + +DO_LOADS: + + + +--:-:-:-:1 R2P PR, predI, 0x0f; +--:-:2:-:1 @P0 LDG.E.CI.[+ vec_size() +] I0, [track0I]; +--:-:2:-:1 @P1 LDG.E.CI.[+ vec_size() +] I1, [track1I]; +--:-:3:-:1 @P2 LDG.E.CI.[+ vec_size() +] I2, [track2I]; +--:-:3:-:1 @P3 LDG.E.CI.[+ vec_size() +] I3, [track3I]; +--:-:-:-:1 @!P0 LDS.U.[+ vec_size() +] I0, [addr_zero]; +--:-:-:-:1 @!P1 LDS.U.[+ vec_size() +] I1, [addr_zero]; +--:-:-:-:1 @!P2 LDS.U.[+ vec_size() +] I2, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.[+ vec_size() +] I3, [addr_zero]; + +--:-:-:-:1 R2P PR, predE, 0x03; +--:-:4:-:1 @P0 LDG.E.CI.[+ vec_size() +] E0, [track0E]; +--:6:5:-:1 @P1 LDG.E.CI.[+ vec_size() +] E1, [track1E]; +--:-:-:-:1 @!P0 LDS.U.[+ vec_size() +] E0, [addr_zero]; +--:-:2:-:1 @!P1 LDS.U.[+ vec_size() +] E1, [addr_zero]; + + +// Advance offset/preds +--:-:-:-:1 IADD n, n, param_loopN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +--:-:-:-:1 @!P4 LOP.AND n, tid7, param_superN; +--:-:-:-:1 @!P4 SHL n, n, 2; +--:-:-:-:1 @!P4 IADD idx_Q, idx_Q, param_strideQ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, idx_Q, param_gridQ, PT; + +--:-:-:-:1 @!P5 MOV idx_Q, start_Q; +--:-:-:-:1 @!P5 IADD idx_P, idx_P, param_strideP; + +--:-:-:-:1 ISETP.LT.AND P6, PT, idx_P, param_gridP, PT; +--:-:-:-:0 ISETP.LT.AND P5, PT, idx_Q, 
param_gridQ, P6; + +--:-:-:-:1 @!P6 MOV predI, RZ; +--:-:-:-:1 @!P6 MOV predE, RZ; + + +--:-:-:-:5 RET; + +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; + +CALC_OFFSETS: + + +// Calc superblock coordinates in m,p,q space +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; + +// Calc this thread's offset within the superblock +--:-:-:-:1 BFE.U32 super_p, tid7, param_superP; +--:-:-:-:1 BFE.U32 super_q, tid7, param_superQ; + +// Combine offsets for final m,p,q coordinate +--:-:-:-:1 IADD p, p, super_p; +--:-:-:-:1 IADD q, q, super_q; + +// y = p * str_h - pad_h + (r * dil_h) +// x = q * str_w - pad_w + (s * dil_w) +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, q, param_str_w, RZ; + +--:-:-:-:1 XMAD y0, r0, param_dil_h, pr; +--:-:-:-:1 XMAD y1, r1, param_dil_h, pr; +--:-:-:-:1 XMAD y2, r2, param_dil_h, pr; +--:-:-:-:1 XMAD y3, r3, param_dil_h, pr; +--:-:-:-:1 IADD y0, y0, -param_pad_h; +--:-:-:-:1 IADD y1, y1, -param_pad_h; +--:-:-:-:1 IADD y2, y2, -param_pad_h; +--:-:-:-:1 IADD y3, y3, -param_pad_h; + +--:-:-:-:1 XMAD x0, s0, param_dil_w, qs; +--:-:-:-:1 XMAD x1, s1, param_dil_w, qs; +--:-:-:-:1 XMAD x2, s2, param_dil_w, qs; +--:-:-:-:1 XMAD x3, s3, param_dil_w, qs; +--:-:-:-:1 IADD x0, x0, -param_pad_w; +--:-:-:-:1 IADD x1, x1, -param_pad_w; +--:-:-:-:1 IADD x2, x2, -param_pad_w; +--:-:-:-:1 IADD x3, x3, -param_pad_w; + +// trackI = c*DHWN + z*HWN + y*WN + x*N + n +--:-:-:-:1 XMAD.S16.U16.LO2C ti0, y0, param_WN, n; +--:-:-:-:1 XMAD.S16.U16.LO2C ti1, y1, param_WN, n; +--:-:-:-:1 XMAD.S16.U16.LO2C ti2, y2, param_WN, n; +--:-:-:-:1 XMAD.S16.U16.LO2C ti3, y3, param_WN, n; +--:-:-:-:1 XMAD.S16.U16 ti0, x0, param_N, ti0; +--:-:-:-:1 XMAD.S16.U16 ti1, x1, param_N, ti1; +--:-:-:-:1 XMAD.S16.U16 ti2, x2, param_N, ti2; +--:-:-:-:1 XMAD.S16.U16 ti3, x3, param_N, ti3; +--:-:-:-:1 IADD ti0, ti0, czOffset0; +--:-:-:-:1 IADD ti1, ti1, czOffset1; +--:-:-:-:1 IADD ti2, ti2, czOffset2; +--:-:-:-:1 IADD ti3, ti3, 
czOffset3; + +// trackNI = param_I + (tiN << dtype_shift), built as a 64-bit pointer in trackNI0/trackNI1. +// NOTE(review): ISET.LT looks like it replaces tiN with its sign word so that IADD.X sign-extends a negative element offset into the high half - confirm against the ISA notes. +20:-:-:-:1 LEA track0I0.CC, ti0, param_I[0], [+ dtype_shift() +]; +--:-:-:-:1 ISET.LT.AND ti0, ti0, RZ, PT; +--:-:-:-:1 IADD.X track0I1, ti0, param_I[1]; +--:-:-:-:1 LEA track1I0.CC, ti1, param_I[0], [+ dtype_shift() +]; +--:-:-:-:1 ISET.LT.AND ti1, ti1, RZ, PT; +--:-:-:-:1 IADD.X track1I1, ti1, param_I[1]; +--:-:-:-:1 LEA track2I0.CC, ti2, param_I[0], [+ dtype_shift() +]; +--:-:-:-:1 ISET.LT.AND ti2, ti2, RZ, PT; +--:-:-:-:1 IADD.X track2I1, ti2, param_I[1]; +--:-:-:-:1 LEA track3I0.CC, ti3, param_I[0], [+ dtype_shift() +]; +--:-:-:-:1 ISET.LT.AND ti3, ti3, RZ, PT; +--:-:-:-:1 IADD.X track3I1, ti3, param_I[1]; + +// Rebuild the predI nibbles from the static c/z predicates kept in bits 8-11. +// Resulting layout (assuming P2R only rewrites the masked low nibble): bits 8-11 = c/z in range, bits 4-7 = that AND 0 <= y < H, bits 0-3 = that AND 0 <= x < W. +--:-:-:-:1 SHR.U32 predI, predI, 8; +--:-:-:-:1 R2P PR, predI, 0x0f; +--:-:-:-:1 SHL predI, predI, 4; + +--:-:-:-:1 ISETP.LT.AND P0, PT, y0, param_H, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_H, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_H, P3; +--:-:-:-:1 ISETP.GE.AND P0, PT, y0, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, y1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, y2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, y3, RZ, P3; +--:-:-:-:1 P2R predI, PR, predI, 0x0f; +--:-:-:-:1 SHL predI, predI, 4; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x0, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, P3; +--:-:-:-:1 ISETP.GE.AND P0, PT, x0, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R predI, PR, predI, 0x0f; + +// trackE = k*MPQN + m*PQN + p*QN + q*N + n +// (k*MPQN + m*PQN is the precomputed kmOffset; track1E is track0E advanced by param_MPQN16p) +--:-:-:-:1 XMAD.LO2C te, p, param_QN, n; +--:-:-:-:1 XMAD te, q, param_N, te; +--:-:-:-:1 IADD te, te, kmOffset; + +--:-:-:-:1 LEA track0E0.CC, te, param_E[0], [+ dtype_shift() +]; +--:-:-:-:1 LEA.HI.X track0E1, te, param_E[1], RZ, [+ dtype_shift() +]; +--:-:-:-:1 IADD track1E0.CC, track0E0, param_MPQN16p; +--:-:-:-:0 IADD.X track1E1,
track0E1, RZ; + +// predE bits 0-1 = (static k/m predicates held in predE bits 2-3) AND p < P AND q < Q. +// LOP3.LUT 0x80 is a three-way AND, so it does not matter that qIn holds the p-bound test and pIn the q-bound test. +--:-:-:-:1 ISET.LT.AND qIn, p, param_P, PT; +--:-:-:-:1 ISET.LT.AND pIn, q, param_Q, PT; +--:-:-:-:1 SHR.U32 predEt, predE, 2; +--:-:-:-:1 LOP3.LUT predEt, predEt, pIn, qIn, 0x80; +--:-:-:-:1 BFI predE, predEt, 0x200, predE; + + +--:-:-:-:5 RET; + + +// MAIN_LOOP is emitted by the Perl block below: 8 unrolled steps (j0-j7) of 64 FFMAs each. +// %insert maps a slot name "j<step>c<ffma index>" to the instruction text appended after that FFMA. +// When no input conversion is needed, steps j1..j4 each get one DEPBAR.LE before their STS group (SB1 for I0/I1, SB2 for I2/I3), mirroring the j1c5..j4c5 entries of the converting path. +// The slot names must be distinct: a repeated hash key silently replaces the earlier entry. +MAIN_LOOP: +[+ + our ($vec_size, $convert_in, $largeN); + my %insert = ( + + j0c8 => "--:-:-:-:1 R2P PR, predI, 0x0f;\n", + + $convert_in ? ( + j1c5 => "--:-:-:-:1 DEPBAR.LE SB1, 1;\n", + j1c8 => "--:-:-:-:1 $convert_in I03, I01.H1;\n", + j1c10 => "--:-:-:-:1 $convert_in I02, I01.H0;\n", + j1c12 => "--:-:-:-:1 $convert_in I01, I00.H1;\n", + j1c14 => "--:-:6:-:1 $convert_in I00, I00.H0;\n", + + j2c5 => "--:-:-:-:1 DEPBAR.LE SB1, 1;\n", + j2c8 => "--:-:-:-:1 $convert_in I13, I11.H1;\n", + j2c10 => "--:-:-:-:1 $convert_in I12, I11.H0;\n", + j2c12 => "--:-:-:-:1 $convert_in I11, I10.H1;\n", + j2c14 => "--:-:6:-:1 $convert_in I10, I10.H0;\n", + + j3c5 => "--:-:-:-:1 DEPBAR.LE SB2, 1;\n", + j3c8 => "--:-:-:-:1 $convert_in I23, I21.H1;\n", + j3c10 => "--:-:-:-:1 $convert_in I22, I21.H0;\n", + j3c12 => "--:-:-:-:1 $convert_in I21, I20.H1;\n", + j3c14 => "--:-:6:-:1 $convert_in I20, I20.H0;\n", + + j4c5 => "--:-:-:-:1 DEPBAR.LE SB2, 1;\n", + j4c8 => "--:-:-:-:1 $convert_in I33, I31.H1;\n", + j4c10 => "--:-:-:-:1 $convert_in I32, I31.H0;\n", + j4c12 => "--:-:-:-:1 $convert_in I31, I30.H1;\n", + j4c14 => "--:-:6:-:1 $convert_in I30, I30.H0;\n", + + j5c8 => "08:-:-:-:1 $convert_in E03, E01.H1;\n", + j5c10 => "--:-:-:-:1 $convert_in E02, E01.H0;\n", + j5c12 => "--:-:-:-:1 $convert_in E01, E00.H1;\n", + j5c14 => "--:-:4:-:1 $convert_in E00, E00.H0;\n", + + j6c8 => "10:-:-:-:1 $convert_in E13, E11.H1;\n", + j6c10 => "--:-:-:-:1 $convert_in E12, E11.H0;\n", + j6c12 => "--:-:-:-:1 $convert_in E11, E10.H1;\n", + j6c14 => "--:-:5:-:1 $convert_in E10, E10.H0;\n", + ) : ( + j1c27 => "--:-:-:-:1 DEPBAR.LE SB1, 1;\n", + j2c27 => "--:-:-:-:1 DEPBAR.LE SB1, 1;\n", + j3c27 => "--:-:-:-:1 DEPBAR.LE SB2, 1;\n", + j4c27 =>
"--:-:-:-:1 DEPBAR.LE SB2, 1;\n", + ), + + j1c30 => "20:-:-:-:1 STS [writeIs + 4x<0*64 + 0*16>], I00;\n", + j1c32 => "--:-:-:-:1 STS [writeIs + 4x<1*64 + 0*16>], I01;\n", + j1c34 => "--:-:-:-:1 STS [writeIs + 4x<2*64 + 0*16>], I02;\n", + j1c36 => "--:6:-:-:1 STS [writeIs + 4x<3*64 + 0*16>], I03;\n", + j1c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];\n", + j1c60 => "20:-:2:-:1 \@P0 LDG.E.CI.$vec_size I0, [track0I];\n", + + j2c30 => "20:-:-:-:1 STS [writeIs + 4x<0*64 + 1*16>], I10;\n", + j2c32 => "--:-:-:-:1 STS [writeIs + 4x<1*64 + 1*16>], I11;\n", + j2c34 => "--:-:-:-:1 STS [writeIs + 4x<2*64 + 1*16>], I12;\n", + j2c36 => "--:6:-:-:1 STS [writeIs + 4x<3*64 + 1*16>], I13;\n", + j2c38 => "--:-:-:-:1 \@!P1 LDS.U.$vec_size I1, [addr_zero];\n", + j2c60 => "20:-:2:-:1 \@P1 LDG.E.CI.$vec_size I1, [track1I];\n", + + j3c30 => "20:-:-:-:1 STS [writeIs + 4x<0*64 + 2*16>], I20;\n", + j3c32 => "--:-:-:-:1 STS [writeIs + 4x<1*64 + 2*16>], I21;\n", + j3c34 => "--:-:-:-:1 STS [writeIs + 4x<2*64 + 2*16>], I22;\n", + j3c36 => "--:6:-:-:1 STS [writeIs + 4x<3*64 + 2*16>], I23;\n", + j3c38 => "--:-:-:-:1 \@!P2 LDS.U.$vec_size I2, [addr_zero];\n", + j3c60 => "20:-:3:-:1 \@P2 LDG.E.CI.$vec_size I2, [track2I];\n", + + j4c30 => "20:-:-:-:1 STS [writeIs + 4x<0*64 + 3*16>], I30;\n", + j4c32 => "--:-:-:-:1 STS [writeIs + 4x<1*64 + 3*16>], I31;\n", + j4c34 => "--:-:-:-:1 STS [writeIs + 4x<2*64 + 3*16>], I32;\n", + j4c36 => "--:6:-:-:1 STS [writeIs + 4x<3*64 + 3*16>], I33;\n", + j4c38 => "--:-:-:-:1 \@!P3 LDS.U.$vec_size I3, [addr_zero];\n", + j4c60 => "20:-:3:-:1 \@P3 LDG.E.CI.$vec_size I3, [track3I];\n", + + j5c7 => "--:-:-:-:1 R2P PR, predE, 0x0f;\n", + + j5c30 => "08:-:-:-:1 STS [writeEs + 4x<0*32 + 0*16>], E00;\n", + j5c32 => "--:-:-:-:1 STS [writeEs + 4x<1*32 + 0*16>], E01;\n", + j5c34 => "--:-:-:-:1 STS [writeEs + 4x<2*32 + 0*16>], E02;\n", + j5c36 => "--:4:-:-:1 STS [writeEs + 4x<3*32 + 0*16>], E03;\n", + j5c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size E0, [addr_zero];\n", + j5c60 
=> "08:-:4:-:1 \@P0 LDG.E.CI.$vec_size E0, [track0E];\n", + + j6c30 => "10:-:-:-:1 STS [writeEs + 4x<0*32 + 1*16>], E10;\n", + j6c32 => "--:-:-:-:1 STS [writeEs + 4x<1*32 + 1*16>], E11;\n", + j6c34 => "--:-:-:-:1 STS [writeEs + 4x<2*32 + 1*16>], E12;\n", + j6c36 => "--:5:-:-:1 STS [writeEs + 4x<3*32 + 1*16>], E13;\n", + j6c38 => "--:-:-:-:1 \@!P1 LDS.U.$vec_size E1, [addr_zero];\n", + j6c60 => "10:6:5:-:1 \@P1 LDG.E.CI.$vec_size E1, [track1E];\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j7c15 => "--:-:-:-:1 PSETP.OR.AND P4, PT, P5, P6, PT;\n", + j7c17 => "--:-:-:-:1 IADD n, n, param_loopN;\n", + j7c27 => "--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, P4;\n", + + $largeN ? ( + j7c30 => "20:-:-:-:1 IADD track0I0.CC, track0I0, param_loopNp;\n", + j7c35 => "--:-:-:-:1 IADD.X track0I1, track0I1, RZ;\n" . + "--:-:-:-:1 IADD track1I0.CC, track1I0, param_loopNp;\n", + j7c40 => "--:-:-:-:1 IADD.X track1I1, track1I1, RZ;\n" . + "--:-:-:-:1 IADD track2I0.CC, track2I0, param_loopNp;\n", + j7c45 => "--:-:-:-:1 IADD.X track2I1, track2I1, RZ;\n" . + "--:-:-:-:1 IADD track3I0.CC, track3I0, param_loopNp;\n", + j7c50 => "--:-:-:-:1 IADD.X track3I1, track3I1, RZ;\n" . + "--:-:-:-:1 IADD track0E0.CC, track0E0, param_loopNp;\n", + j7c55 => "--:-:-:-:1 IADD.X track0E1, track0E1, RZ;\n" . + "--:-:-:-:1 IADD track1E0.CC, track1E0, param_loopNp;\n", + j7c60 => "--:-:-:-:1 IADD.X track1E1, track1E1, RZ;\n", + ) : (), + + j7c63 => "--:-:-:Y:5 \@P4 BRA.U MAIN_LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 
7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) & 7; + my $shift = ((($j + 1) & 7) >> 2) << 2; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy0, [readIs + 4x<%d*64 + 00 + %d>];\n", $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 LDS.U.128 j%dEx0, [readEs + 4x<%d*32 + 00 + %d>];\n", $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy4, [readIs + 4x<%d*64 + 32 + %d>];\n", $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 LDS.U.128 j%dEx4, [readEs + 4x<%d*32 + 16 + %d>];\n", $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1; + + my $yield = $c == 25 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] +// Advance x/q offsets+preds + +--:-:-:-:1 IADD x0, x0, param_loopX; +--:-:-:-:1 IADD x1, x1, param_loopX; +--:-:-:-:1 IADD x2, x2, param_loopX; +--:-:-:-:1 IADD x3, x3, param_loopX; +20:-:-:-:1 IADD track0I0.CC, track0I0, param_loopXp; +--:-:-:-:1 IADD.X track0I1, track0I1, RZ; +--:-:-:-:1 IADD track1I0.CC, track1I0, param_loopXp; +--:-:-:-:1 IADD.X track1I1, track1I1, RZ; +--:-:-:-:1 IADD track2I0.CC, track2I0, param_loopXp; +--:-:-:-:1 IADD.X track2I1, track2I1, RZ; +--:-:-:-:1 IADD track3I0.CC, track3I0, param_loopXp; +--:-:-:-:1 IADD.X track3I1, track3I1, RZ; + +--:-:-:-:1 SHR.U32 predI, predI, 4; +--:-:-:-:1 @P6 R2P PR, predI, 0x0f; +--:-:-:-:1 SHL predI, predI, 4; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x0, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, P3; +--:-:-:-:1 ISETP.GE.AND P0, PT, x0, 
RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R predI, PR, predI, 0x0f; + +--:-:-:-:1 IADD q, q, param_loopQ; +--:-:-:-:1 ISETP.LT.AND P4, PT, q, param_Q, PT; +--:-:-:-:1 @!P4 LOP.AND predE, predE, 0xc; + +--:-:-:-:1 IADD track0E0.CC, track0E0, param_loopQp; +--:-:-:-:1 IADD.X track0E1, track0E1, RZ; +--:-:-:-:1 IADD track1E0.CC, track1E0, param_loopQp; + +--:-:-:-:1 IADD idx_Q, idx_Q, param_strideQ; +--:-:-:-:1 ISETP.LT.AND P5, PT, idx_Q, param_gridQ, P6; + +--:-:-:-:1 LOP.AND n, tid7, param_superN; +--:-:-:-:1 SHL n, n, 2; + + +--:-:-:-:0 IADD.X track1E1, track1E1, RZ; +--:-:-:Y:5 @P5 BRA.U MAIN_LOOP; + +// Advance y/p offsets+preds + +--:-:-:-:1 MOV idx_Q, start_Q; +--:-:-:-:1 IADD idx_P, idx_P, param_strideP; + +--:-:-:-:1 PSETP.AND.AND P5, PT, PT, PT, PT; +--:-:-:Y:d ISETP.LT.AND P6, PT, idx_P, param_gridP, PT; + +--:-:-:Y:5 @!P6 BRA.U FINISH_LOOP; +--:-:-:-:5 CAL CALC_OFFSETS; +--:-:-:Y:5 @P6 BRA.U MAIN_LOOP; + +// Set n to loop remaining times +FINISH_LOOP: +--:-:-:-:1 LOP.AND.NZ P5, RZ, init, 3; +--:-:-:-:1 MOV predI, RZ; +--:-:-:-:1 MOV predE, RZ; +--:-:-:-:1 MOV loopN, param_loopN; +--:-:-:Y:8 MOV N, param_N; +--:-:-:-:1 VMAD.U16.U16 n, -init, loopN, N; +--:-:-:-:0 MOV init, RZ; +01:-:-:Y:5 @P5 BRA.U MAIN_LOOP; + + +--:-:1:-:2 S2R Tid, SR_TID.X; + +01:-:-:-:1 SHR.U32 tid_32, Tid, 5; +--:-:-:-:1 LOP.AND tid_31, Tid, 31; + +// readFs = (tid_32 << 7 + tid_31) << 2 +--:-:-:-:1 ISCADD readFs, tid_32, tid_31, 7; +--:-:-:-:1 SHL readFs, readFs, 2; + +// kk = idx_K*32 + tid31; +--:-:-:-:1 ISCADD kk, idx_K, tid_31, 5; +// kk < K +--:-:-:-:1 ISETP.LT.AND P4, PT, kk, param_K, PT; + +// crst = idx_C*64 + tid_32*4 +--:-:-:-:1 SHL tid_32, tid_32, 2; +--:-:-:-:1 ISCADD crst00, idx_C, tid_32, 6; +--:-:-:-:1 IADD crst04, crst00, 16; +--:-:-:-:1 IADD crst08, crst00, 32; +--:-:-:-:1 IADD crst12, crst00, 48; + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; 
+--:-:-:-:1 SHL K16, K, 6; + +--:-:-:-:1 MOV alpha, param_alpha; + +// trackF += crst*K + k; +--:-:-:-:1 XMAD.LO2 tf, crst00, K, kk; +[+ + our $determ; + return $determ ? q{ +// idx_MPQ = idx_M * grid_PQ + idx_P * grid_Q + idx_Q +// trackF += idx_MPQ * CRSTK +--:-:-:-:1 XMAD idx_MPQ, start_P, param_strideQ, start_Q; +--:-:-:-:1 XMAD.LO2C idx_MPQ, idx_M, param_stridePQ, idx_MPQ; +--:-:-:-:1 XMAD.LO tf, idx_MPQ, param_CTRSK, tf, xmad_determ; + } : ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 2; +--:-:-:-:1 LEA.HI.X track00F1, tf, param_F[1], RZ, 2; +--:-:-:-:1 IADD track04F0.CC, track00F0, K16; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:1 IADD track08F0.CC, track04F0, K16; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:1 IADD track12F0.CC, track08F0, K16; +--:-:-:-:1 IADD.X track12F1, track08F1, RZ; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, 
cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:1 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:1 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:1 STS.128 [writeFs+4x<0*128 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 [writeFs+4x<0*128 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeFs+4x<1*128 + 00>], shuffle_x0y1; +--:-:-:-:1 STS.128 [writeFs+4x<1*128 + 16>], shuffle_x4y1; +--:-:-:-:1 STS.128 [writeFs+4x<2*128 + 00>], shuffle_x0y2; +--:-:-:-:1 STS.128 [writeFs+4x<2*128 + 16>], shuffle_x4y2; +--:-:-:-:1 STS.128 [writeFs+4x<3*128 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeFs+4x<3*128 + 16>], shuffle_x4y3; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_F; +--:-:-:-:0 IADD readFs, readFs, 4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_F; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:0 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:1 FMUL shuffle_x7y4, cx7y4, alpha; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:1 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:1 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:1 FMUL shuffle_x7y6, cx7y6, 
alpha; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:1 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:1 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:1 STS.128 [writeFs+4x<0*128 + 00>], shuffle_x0y4; +--:-:-:-:1 STS.128 [writeFs+4x<0*128 + 16>], shuffle_x4y4; +--:-:-:-:1 STS.128 [writeFs+4x<1*128 + 00>], shuffle_x0y5; +--:-:-:-:1 STS.128 [writeFs+4x<1*128 + 16>], shuffle_x4y5; +--:-:-:-:1 STS.128 [writeFs+4x<2*128 + 00>], shuffle_x0y6; +--:-:-:-:1 STS.128 [writeFs+4x<2*128 + 16>], shuffle_x4y6; +--:-:-:-:1 STS.128 [writeFs+4x<3*128 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeFs+4x<3*128 + 16>], shuffle_x4y7; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:0 IADD readFs, readFs, -4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_F; +--:-:-:-:0 IADD readFs, readFs, 4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_F; + +--:-:-:-:5 EXIT; + +STORE_F: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CTRS, P4; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CTRS, P4; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CTRS, P4; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CTRS, P4; // crst12 < CRST && k < K +--:-:-:-:1 IADD crst12, crst12, 1; + +--:-:-:-:1 LDS f00_0, [readFs + 4x< 0*128 + 0*32 + 0*16>]; +--:-:-:-:1 LDS f00_1, [readFs + 4x< 0*128 + 1*32 + 0*16>]; +--:-:-:-:1 LDS f00_2, [readFs + 4x< 0*128 + 2*32 + 0*16>]; +--:-:1:Y:1 LDS f00_3, [readFs + 4x< 0*128 + 3*32 + 0*16>]; +--:-:-:-:1 LDS f04_0, [readFs + 4x< 4*128 + 0*32 + 1*16>]; +--:-:-:-:1 LDS f04_1, [readFs + 4x< 4*128 + 1*32 + 1*16>]; +--:-:-:-:1 LDS f04_2, [readFs + 4x< 4*128 + 2*32 + 1*16>]; +--:-:2:Y:1 LDS f04_3, [readFs + 
4x< 4*128 + 3*32 + 1*16>]; +--:-:-:-:1 LDS f08_0, [readFs + 4x< 8*128 + 0*32 + 2*16>]; +--:-:-:-:1 LDS f08_1, [readFs + 4x< 8*128 + 1*32 + 2*16>]; +--:-:-:-:1 LDS f08_2, [readFs + 4x< 8*128 + 2*32 + 2*16>]; +--:-:3:Y:1 LDS f08_3, [readFs + 4x< 8*128 + 3*32 + 2*16>]; +--:-:-:-:1 LDS f12_0, [readFs + 4x<12*128 + 0*32 + 3*16>]; +--:-:-:-:1 LDS f12_1, [readFs + 4x<12*128 + 1*32 + 3*16>]; +--:-:-:-:1 LDS f12_2, [readFs + 4x<12*128 + 2*32 + 3*16>]; +--:-:4:Y:1 LDS f12_3, [readFs + 4x<12*128 + 3*32 + 3*16>]; + + + +01:-:-:-:1 FADD f00_0, f00_0, f00_1; +--:-:-:-:1 FADD f00_2, f00_2, f00_3; +02:-:-:-:1 FADD f04_0, f04_0, f04_1; +--:-:-:-:1 FADD f04_2, f04_2, f04_3; +04:-:-:-:1 FADD f08_0, f08_0, f08_1; +--:-:-:-:1 FADD f08_2, f08_2, f08_3; +08:-:-:-:1 FADD f12_0, f12_0, f12_1; +--:-:-:-:1 FADD f12_2, f12_2, f12_3; + +--:-:-:-:1 FADD f00_0, f00_0, f00_2; +--:-:-:-:2 FADD f04_0, f04_0, f04_2; +--:-:-:-:2 FADD f08_0, f08_0, f08_2; +--:-:-:-:0 FADD f12_0, f12_0, f12_2; + +01:1:-:-:1 @P0 [+ output_op() +] [track00F], f00_0; +02:2:-:-:1 @P1 [+ output_op() +] [track04F], f04_0; +04:3:-:-:1 @P2 [+ output_op() +] [track08F], f08_0; +08:4:-:-:1 @P3 [+ output_op() +] [track12F], f12_0; + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; \ No newline at end of file diff --git a/Kernel/Convolution/Maxwell/xconv_direct_xprop_64x32.sass b/Kernel/Convolution/Maxwell/xconv_direct_xprop_64x32.sass new file mode 100644 index 0000000..4720ab8 --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_direct_xprop_64x32.sass @@ -0,0 +1,2477 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our ($type, $SN, $N2, $N1); + our $LN = !($SN || $N2 || $N1); + our $dtype = $type eq 'h' ? 'U16' : '32'; + our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; + our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; + our $vsize = $type eq 'h' ? '64' : '128'; + our $dshift = $type eq 'h' ? '1' : '2'; + our $dsize = $type eq 'h' ? '2' : '4'; + our $slice_scale = $N1 ? 4 : $N2 ? 3 : 2; + our $slice_offset = 1 << $slice_scale; + our $slice_load = 8 << $slice_scale; + sub dtype { return $dtype; } + sub dshift { return $dshift; } + sub vsize { return $vsize; } + our $vsizeI; + if ($type eq 'h') + { $vsizeI = $N1 ? 'U16' : $N2 ? '32' : '64'; } + else + { $vsizeI = $N1 ? '32' : $N2 ? 
'64' : '128'; } +-] + + + + addr_zero : 4x<(32 + 64)*32*2> + addr_szLut : 4x<(32 + 64)*32*2 + 4> + addr_lut4 : 4x<(32 + 64)*32*2 + 4> + addr_lut : 4x<(32 + 64)*32*2 + 6> + + szShareF : (64*32) + szShareI : (32*32) + + param_Sum[0] : c[0x0][0x140] + param_Sum[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_C : c[0x0][0x174] + param_D : c[0x0][0x178] + param_H : c[0x0][0x17c] + param_W : c[0x0][0x180] + param_N : c[0x0][0x184] + param_K : c[0x0][0x188] + param_M : c[0x0][0x18c] + param_P : c[0x0][0x190] + param_Q : c[0x0][0x194] + param_str_d : c[0x0][0x198] + param_str_h : c[0x0][0x19c] + param_str_w : c[0x0][0x1a0] + param_pad_d : c[0x0][0x1a4] + param_pad_h : c[0x0][0x1a8] + param_pad_w : c[0x0][0x1ac] + param_dil_d : c[0x0][0x1b0] + param_dil_h : c[0x0][0x1b4] + param_dil_w : c[0x0][0x1b8] + param_DHWN : c[0x0][0x1bc] + param_HWN : c[0x0][0x1c0] + param_WN : c[0x0][0x1c4] + param_MPQN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_QN : c[0x0][0x1d0] + param_PQnk : c[0x0][0x1d4] + param_Qnk : c[0x0][0x1d8] + param_nk : c[0x0][0x1dc] + param_n : c[0x0][0x1e0] + param_k : c[0x0][0x1e4] + param_magic_PQnk : c[0x0][0x1e8] + param_shift_PQnk : c[0x0][0x1ec] + param_magic_Qnk : c[0x0][0x1f0] + param_shift_Qnk : c[0x0][0x1f4] + param_magic_nk : c[0x0][0x1f8] + param_shift_nk : c[0x0][0x1fc] + param_magic_k : c[0x0][0x200] + param_shift_k : c[0x0][0x204] + param_Km32 : c[0x0][0x208] + param_K32p : c[0x0][0x20c] + param_TRSK : c[0x0][0x210] + param_TRS : c[0x0][0x214] + param_RS : c[0x0][0x218] + param_S : c[0x0][0x21c] + param_magic_RS : c[0x0][0x220] + param_shift_RS : c[0x0][0x224] + param_magic_S : c[0x0][0x228] + param_shift_S : c[0x0][0x22c] + param_gridP2 : 
c[0x0][0x230] + param_gridQ : c[0x0][0x234] + param_gridN : c[0x0][0x238] + param_gridQN : c[0x0][0x23c] + param_gridPQN : c[0x0][0x240] + param_gridMPQN : c[0x0][0x244] + param_superM : c[0x0][0x248] + param_superP : c[0x0][0x24c] + param_superQ : c[0x0][0x250] + param_superN : c[0x0][0x254] + param_shiftM : c[0x0][0x258] + param_shiftP : c[0x0][0x25c] + param_shiftQ : c[0x0][0x260] + param_shiftN : c[0x0][0x264] + param_SuperM : c[0x0][0x268] + param_SuperP : c[0x0][0x26c] + param_SuperQ : c[0x0][0x270] + param_SuperN : c[0x0][0x274] + param_magic_str_d : c[0x0][0x278] + param_shift_str_d : c[0x0][0x27c] + param_magic_str_h : c[0x0][0x280] + param_shift_str_h : c[0x0][0x284] + param_magic_str_w : c[0x0][0x288] + param_shift_str_w : c[0x0][0x28c] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Fy<0-7>, j0Ix<0-7> + 80-95 : j1Fy<0-7>, j1Ix<0-7> + + 96-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>, I0<0-3>, I1<0-3> + 120-131 : track0F<0-1>, track1F<0-1>, track2F<0-1>, track3F<0-1>, track0I<0-1>, track1I<0-1> + + 64-83 ~ tidY, m, p, q, negOne, trs, lutStore2, lut_size, warp_count, warp_inc, neg_RS, neg_S, dep_thd_mask, qs, pr, mt, neg_str_w, neg_str_h, neg_str_d + + 84-131 ~ idx_MPQnk, idx_PQnk, idx_Qnk, idx_nk, idx_n, idx_k, magic_PQnk, magic_Qnk, neg_PQnk, neg_Qnk, neg_nk, neg_k, div1, div2, div3, idx_P2, idx_Q2, super_m, super_p, super_q, super_n, tid1, tid2, tid3, tid7, tid8, tid31, tid32, readIs2, tidX, k<0|1|2|3>, sb, warp_mask, mask_shr, shiftSB, maskSB, q<1|2|3> + + 84-131 ~ rs, t, r, s, z, y, x, x<1|2|3>, z_prime, y_prime, x_prime, x_prime<1|2|3>, z_mod, y_mod, x_mod, x_mod<1|2|3>, lutStore, ballot, warp_slices, dep_thd_bits, dep_thd_cnt, tidY1 + +[+ + our ($SN, $N2, 
$N1); + return $N1 ? q{ + 132-135 : slice0I<0-3> + 168-171 : slice1I<0-3> + 172-183 : track0I<2-3>, track0I<4-5>, track0I<6-7>, track1I<2-3>, track1I<4-5>, track1I<6-7> + 184-185 ~ predsI + + } : $N2 ? q{ + 132-135 : slice0I<0-1>, slice1I<0-1> + 168-171 : track0I<2-3>, track1I<2-3> + + } : $SN ? q{ + 132-135 ~ slice0I, slice1I + + } : q{ + 132-133 : sliceI, sliceF + 132-133 : sliceIF<0-1> + 132-135 : sliceI0, sliceF0, sliceI1, sliceF1 + 132-135 : slice0IF<0-1>, slice1IF<0-1> + }; ++] + + 136-151 ~ posCTRS, endCTRS, endCTRS32, lutSize, lutSizeRcp, lutSizeM1, posCTRSf, channel, lutOffset0, lutOffset1, offsetIc0, offsetIc1, offsetFc0, offsetFc1, partial + 152-167 ~ tid, idx_K, idx_M, idx_P, idx_Q, idx_N, k, n, writeFs, writeIs, readFs, readIs, swapBuf, writeOs, preds, sb_offset + + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + + 64-95 ~ o00_<0-3>, o04_<0-3>, o08_<0-3>, o12_<0-3>, b<00|04|08|12>, x<00|04|08|12>, bsum<00|04|08|12> + 96-131 ~ tid_31, tid_32, alpha, readOs, MPQN16, MPQN4, k<00|04|08|12>, offset, one, M, P, Q, N, super_M, super_P, super_Q, super_N, bsum_offset + 0-7 : Out00_<0-1>, Out04_<0-1>, Out08_<0-1>, Out12_<0-1> + 8-15 : Sum00_<0-1>, Sum04_<0-1>, Sum08_<0-1>, Sum12_<0-1> + 16-31 ~ out<00|04|08|12>, sum<00|04|08|12> + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQnk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +// tidX = (tid & 7) << 2 +// tidY = tid >> 3 << 1 +01:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHL tidX, tid7, 2; +--:-:-:-:1 SHR.U32 tid3, tid, 3; +--:-:-:-:1 SHL tidY, tid3, 1; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// idx_M = idx_MPQnk / blk_PQnk +--:-:-:-:1 MOV magic_PQnk, param_magic_PQnk; +--:-:-:-:1 ISETP.NE.AND P0, PT, magic_PQnk, 1, PT; +02:-:-:-:1 @P0 XMAD div1, 
idx_MPQnk, magic_PQnk, RZ; +--:-:-:-:1 @P0 XMAD div2, idx_MPQnk, magic_PQnk.H1, RZ; +--:-:-:-:1 @P0 XMAD div3, idx_MPQnk.H1, magic_PQnk.H1, RZ; +--:-:-:-:1 @P0 XMAD.CHI div1, idx_MPQnk.H1, magic_PQnk, div1; +--:-:-:-:1 @P0 IADD3.RS idx_M, div1, div2, div3; +--:-:-:-:1 @P0 SHR.U32 idx_M, idx_M, param_shift_PQnk; +--:-:-:-:1 @!P0 SHR.U32 idx_M, idx_MPQnk, param_shift_PQnk; + +// idx_PQnk = idx_MPQnk % blk_PQnk +--:-:-:-:1 IADD neg_PQnk, RZ, -param_PQnk; +--:-:-:-:1 XMAD.LO2 idx_PQnk, neg_PQnk, idx_M, idx_MPQnk; + +// idx_P2 = idx_PQnk / blk_Qnk +--:-:-:-:1 MOV magic_Qnk, param_magic_Qnk; +--:-:-:-:1 ISETP.NE.AND P1, PT, magic_Qnk, 1, PT; +--:-:-:-:1 @P1 XMAD div1, idx_PQnk, magic_Qnk, RZ; +--:-:-:-:1 @P1 XMAD div2, idx_PQnk, magic_Qnk.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, idx_PQnk.H1, magic_Qnk.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk, div1; +--:-:-:-:1 @P1 IADD3.RS idx_P2, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 idx_P2, idx_P2, param_shift_Qnk; +--:-:-:-:1 @!P1 SHR.U32 idx_P2, idx_PQnk, param_shift_Qnk; + +// idx_Qnk = idx_PQnk % blk_Qnk +--:-:-:-:1 IADD neg_Qnk, RZ, -param_Qnk; +--:-:-:-:1 XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk; + +// idx_Q2 = idx_Qnk / nk +--:-:-:-:1 XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ; +--:-:-:-:1 SHR.U32 idx_Q2, idx_Q2, param_shift_nk; +// idx_nk = idx_Qnk % nk +--:-:-:-:1 IADD neg_nk, RZ, -param_nk; +--:-:-:-:1 XMAD.S16.U16 idx_nk, neg_nk, idx_Q2, idx_Qnk; + +// idx_n = idx_nk / k +--:-:-:-:1 XMAD idx_n, idx_nk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_n, idx_n, param_shift_k; +// idx_k = idx_nk % k +--:-:-:-:1 IADD neg_k, RZ, -param_k; +--:-:-:-:1 XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk; + +// idx_N = idx_N * blk_n + idx_n +// idx_K = idx_K * blk_k + idx_k +08:-:-:-:1 XMAD idx_N, idx_N, param_n, idx_n; +04:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +--:-:-:-:1 ISCADD k, idx_K, tidX, 6; + + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// idx_P = idx_P2 * 2 
+// idx_Q = idx_Q2 +// if idx_P2 != gridP2: +// idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1) +// idx_Q = idx_Q2 >> 1 +--:-:-:-:1 ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT; +--:-:-:-:1 SHL idx_P, idx_P2, 1; +--:-:-:-:1 @P1 LOP.AND q1, idx_Q2, 1; +--:-:-:-:1 @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P1 LOP.XOR q1, q1, q2; +--:-:-:-:1 @P1 IADD idx_P, idx_P, q1; +--:-:-:-:1 @P1 SHR.U32 idx_Q, idx_Q2, 1; +--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2; + +// Scan backwards on odd rows +// if idx_P2 & 1: +// idx_Q = gridQ - idx_Q - 1 +--:-:-:-:1 LOP.AND.NZ P0, RZ, idx_P2, 1; +--:-:-:-:1 MOV negOne, -1; +--:-:-:-:1 @P0 IADD3 idx_Q, -idx_Q, param_gridQ, negOne; + +// writeFs = (tidY*64 + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidX, 6; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (tidY*32 + tidX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 5; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + + +// readIs = ((tid >> 1) & 3) << 4 +--:-:-:-:1 BFE.U32 readIs, tid, 0x201; // 2 bits at position 1 + +// readFs = (((tid & 24) >> 2) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 24; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; + +// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5) +// tid32 = tid & -32 +--:-:-:-:1 LOP.AND tid32, tid, -32; + +// readIs2 = readIs + (tid32 >> 2) + (readFs << 2) +--:-:-:-:1 SHR.U32 readIs2, tid32, 2; +--:-:-:-:1 IADD readIs2, readIs2, readIs; +--:-:-:-:1 ISCADD readIs2, readFs, readIs2, 2; + +--:-:-:-:1 SHL readFs, readFs, 4; +--:-:-:-:1 SHL readIs, readIs, 4; +--:-:-:-:1 SHL readIs2, readIs2, 4; + +// writeOs = readFs*32*4 + readIs2 +--:-:-:-:1 ISCADD writeOs, readFs, readIs2, 7; + +// Each block of 32 threads works on 8 lines, +// readFs += tid32/4 * 64 * 4 +// readIs += tid32/4 * 32 * 4 + 4x +--:-:-:-:1 ISCADD readFs, tid32, readFs, 6; +--:-:-:-:1 ISCADD readIs, tid32, readIs, 5; +--:-:-:-:1 IADD readIs, readIs, 4x; + 
+--:-:-:-:1 MOV32I swapBuf, 4x; + +[+ + our $K1; + return $K1 ? q{ +--:-:-:-:1 IADD k0, k, 32; +--:-:-:-:1 IADD k1, k, 33; +--:-:-:-:1 IADD k2, k, 34; +--:-:-:-:1 IADD k3, k, 35; +--:-:-:-:1 ISETP.LT.AND P0, PT, k0, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, param_K, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +--:-:-:-:1 SHL preds, preds, 4; + +--:-:-:-:1 IADD k1, k, 1; +--:-:-:-:1 IADD k2, k, 2; +--:-:-:-:1 IADD k3, k, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, param_K, PT; +--:-:-:-:1 P2R preds, PR, preds, 0x0f; + } : ''; ++] + +[+ + our ($SN, $N2, $N1); + return $N1 ? q{ +--:-:-:-:1 SHL m, idx_M, param_shiftM; +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; + +--:-:-:-:1 BFE.U32 super_m, tid7, param_superM; +--:-:-:-:1 BFE.U32 super_p, tid7, param_superP; +--:-:-:-:1 BFE.U32 super_q, tid7, param_superQ; + +--:-:-:-:1 IADD m, m, super_m; +--:-:-:-:1 IADD p, p, super_p; +--:-:-:-:1 ISCADD q, super_q, q, 2; +--:-:-:-:1 IADD q1, q, 1; +--:-:-:-:1 IADD q2, q, 2; +--:-:-:-:1 IADD q3, q, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, m, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, p, param_P, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, q, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, q1, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, q2, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, q3, param_Q, P4; +--:-:-:-:1 P2R predsI, PR, RZ, 0x0f; + +// warp_count = 16 +// warp_inc = 16 +// trs = tid3 +--:-:-:-:1 MOV warp_count, 16; +--:-:-:-:1 MOV warp_inc, 16; +--:-:-:-:1 MOV trs, tid3; +// compute shared memory super-block offset into the lookup table +// sb_offset = tid7 * TRS * 4 * 4 +--:-:-:-:1 XMAD sb_offset, tid7, param_TRS, RZ; +--:-:-:-:1 SHL sb_offset, sb_offset, 4; + + } : $N2 ? 
q{ + +--:-:-:-:1 SHL m, idx_M, param_shiftM; +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; + +--:-:-:-:1 BFE.U32 super_m, tid7, param_superM; +--:-:-:-:1 BFE.U32 super_p, tid7, param_superP; +--:-:-:-:1 BFE.U32 super_q, tid7, param_superQ; + +--:-:-:-:1 IADD m, m, super_m; +--:-:-:-:1 IADD p, p, super_p; +--:-:-:-:1 ISCADD q, super_q, q, 1; +--:-:-:-:1 IADD q1, q, 1; + +--:-:-:-:1 ISETP.LT.AND P4, PT, m, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, p, param_P, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, q, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, q1, param_Q, P4; + +// warp_count = 16 +// warp_inc = 16 +// trs = tid3 +--:-:-:-:1 MOV warp_count, 16; +--:-:-:-:1 MOV warp_inc, 16; +--:-:-:-:1 MOV trs, tid3; +// compute shared memory super-block offset into the lookup table +// sb_offset = tid7 * TRS * 4 * 2 +--:-:-:-:1 XMAD sb_offset, tid7, param_TRS, RZ; +--:-:-:-:1 SHL sb_offset, sb_offset, 3; + + } : $SN ? q{ +--:-:-:-:1 SHL m, idx_M, param_shiftM; +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; +--:-:-:-:1 SHL n, idx_N, param_shiftN; + +--:-:-:-:1 BFE.U32 super_m, tid7, param_superM; +--:-:-:-:1 BFE.U32 super_p, tid7, param_superP; +--:-:-:-:1 BFE.U32 super_q, tid7, param_superQ; +--:-:-:-:1 LOP.AND super_n, tid7, param_superN; + +--:-:-:-:1 IADD m, m, super_m; +--:-:-:-:1 IADD p, p, super_p; +--:-:-:-:1 IADD q, q, super_q; +--:-:-:-:1 ISCADD n, super_n, n, 2; + +--:-:-:-:1 ISETP.LT.AND P0, PT, m, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, p, param_P, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, q, param_Q, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, n, param_N, P0; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P2; + +// sb = tid7 >> (shiftN - 2): 0-1,0-3,0-7 +--:-:-:-:1 MOV shiftSB, param_shiftN; +--:-:-:-:1 IADD shiftSB, shiftSB, -2; +--:-:-:-:1 SHR.U32 sb, tid7, shiftSB; +// warp_count = 4 << shiftN: 64,32,16 +--:-:-:-:1 MOV warp_count, 4; +--:-:-:-:1 SHL warp_count, warp_count, param_shiftN; +--:-:-:-:1 MOV 
warp_inc, warp_count; +// maskSB = (1 << shiftSB) - 1: 3,1,0 +--:-:-:-:1 MOV maskSB, 1; +--:-:-:-:1 SHL maskSB, maskSB, shiftSB; +--:-:-:-:1 IADD maskSB, maskSB, -1; +// trs = tid3 << shiftSB + (tid7 & mask) +--:-:-:-:1 LOP.AND maskSB, tid7, maskSB; +--:-:-:-:1 SHL trs, tid3, shiftSB; +--:-:-:-:1 IADD trs, trs, maskSB; +// compute shared memory super-block offset into the lookup table +// sb_offset = sb * TRS * 4 +--:-:-:-:1 XMAD sb_offset, sb, param_TRS, RZ; +--:-:-:-:1 SHL sb_offset, sb_offset, 2; + + } : q{ +--:-:-:-:1 SHL n, idx_N, 5; +--:-:-:-:1 ISCADD n, tid7, n, 2; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +--:-:-:-:1 MOV trs, tid; +--:-:-:-:1 MOV lutStore2, RZ; +--:-:-:-:1 MOV lut_size, RZ; +--:-:-:-:1 MOV warp_count, 32; +--:-:-:-:1 MOV warp_inc, 32; + +--:-:-:-:1 IADD mask_shr, -tid, 32; +--:-:-:-:1 SHR.U32 dep_thd_mask, negOne, mask_shr; + +--:-:-:-:1 ISETP.GE.AND P6, PT, tid, 32, PT; + + }; ++] +--:-:-:-:1 IADD neg_RS, RZ, -param_RS; +--:-:-:-:1 IADD neg_S, RZ, -param_S; + +[+ + our ($LN, $prop); + my ($m, $p, $q) = $LN ? qw(idx_M idx_P idx_Q) : qw(m p q); + return $prop eq 'f' ? qq{ +// mt = m * str_d - pad_d +// pr = p * str_h - pad_h +// qs = q * str_w - pad_w +--:-:-:-:1 XMAD mt, $m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, $p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, $q, param_str_w, RZ; +--:-:-:-:1 IADD mt, mt, -param_pad_d; +--:-:-:-:1 IADD pr, pr, -param_pad_h; +--:-:-:-:1 IADD qs, qs, -param_pad_w; + } : qq{ +// mt = m - pad_d +// pr = p - pad_h +// qs = q - pad_w +--:-:-:-:1 IADD mt, $m, -param_pad_d; +--:-:-:-:1 IADD pr, $p, -param_pad_h; +--:-:-:-:1 IADD qs, $q, -param_pad_w; + +--:-:-:-:1 IADD neg_str_d, RZ, -param_str_d; +--:-:-:-:1 IADD neg_str_h, RZ, -param_str_h; +--:-:-:-:1 IADD neg_str_w, RZ, -param_str_w; + }; ++] + + +[+ + our $LN; return $LN ? 
q{ +--:-:-:-:5 @P6 BRA.U END_SETUP; + } : ''; ++] + +LUT_LOOP: + + +// warp synchronous loop while warp_count < TRS +--:-:-:-:1 ISETP.LT.AND P6, PT, warp_count, param_TRS, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, trs, param_TRS, PT; + +--:-:-:-:1 IADD warp_count, warp_count, warp_inc; +// t = trs / RS +// rs = trs % RS +--:-:-:-:1 XMAD.U16.U16 t, trs, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD.U16.S16 rs, t, neg_RS, trs; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.U16.U16 r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD.U16.S16 s, r, neg_S, rs; + +[+ + our ($SN, $N2, $N1, $prop); + if ($prop eq 'f') + { + return $N1 ? q{ +// x = qs + (s * dil_w) +// y = pr + (r * dil_h) +// z = mt + (t * dil_d) +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 IADD x1, x, param_str_w; +--:-:-:-:1 IADD x2, x1, param_str_w; +--:-:-:-:1 IADD x3, x2, param_str_w; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y, RZ, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P5; +--:-:-:-:1 @P4 R2P PR, predsI, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, P3; + +// sliceI = z*HWN + y*WN + x +01:-:-:-:1 XMAD.LO2C slice0I0, z, param_HWN, x; +--:-:-:-:1 XMAD.LO2C slice0I0, y, param_WN, slice0I0; +--:-:-:-:1 IADD slice0I1, slice0I0, param_str_w; +--:-:-:-:1 IADD slice0I2, slice0I1, param_str_w; +--:-:-:-:1 IADD slice0I3, slice0I2, param_str_w; + +--:-:-:-:1 @!P0 
MOV slice0I0, -1; +--:-:-:-:1 @!P1 MOV slice0I1, -1; +--:-:-:-:1 @!P2 MOV slice0I2, -1; +--:-:-:-:1 @!P3 MOV slice0I3, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 4; +--:-:-:-:1 IADD trs, trs, warp_inc; + +--:1:-:-:1 @P5 STS.128 [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : $N2 ? q{ + +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 IADD x1, x, param_str_w; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y, RZ, P5; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P3; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; + +// sliceI = z*HWN + y*WN + x*2 +01:-:-:-:1 XMAD.LO2C slice0I0, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I0, y, param_WN, slice0I0; +--:-:-:-:1 ISCADD slice0I1, x1, slice0I0, 1; +--:-:-:-:1 ISCADD slice0I0, x, slice0I0, 1; + +--:-:-:-:1 @!P0 MOV slice0I0, -1; +--:-:-:-:1 @!P1 MOV slice0I1, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 3; +--:-:-:-:1 IADD trs, trs, warp_inc; + +--:1:-:-:1 @P5 STS.64 [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : $SN ? 
q{ + +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P2, PT, x, RZ, P5; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x, param_W, P2; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD.LO2C slice0I, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I, y, param_WN, slice0I; +--:-:-:-:1 XMAD slice0I, x, param_N, slice0I; + + +--:-:-:-:1 @!P0 MOV slice0I, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 2; +--:-:-:-:1 IADD trs, trs, warp_inc; + + +--:1:-:-:1 @P5 STS [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : q{ + +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P2, PT, x, RZ, P5; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x, param_W, P2; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD sliceI, x, param_N, sliceI; +// sliceF = trs * K +--:-:-:-:1 XMAD sliceF, trs, param_K, RZ; + + + +// Get a mask of all valid slices in the warp +--:-:-:-:1 VOTE.ANY ballot, PT, P0; +// Count the total valid slices +--:-:2:-:1 POPC warp_slices, ballot; +// Prepare lutStore for this and next loop +--:-:-:-:1 @P0 MOV lutStore, lutStore2; +02:-:-:-:1 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +--:-:-:-:1 @P0 LOP.AND dep_thd_bits, dep_thd_mask, ballot; +--:-:3:-:1 @P0 POPC 
dep_thd_cnt, dep_thd_bits; +// use the trs increment to space the barrier sync +--:-:-:-:1 IADD trs, trs, warp_inc; +// Update the lutStore address from this count +04:-:-:-:1 @P0 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +--:1:-:-:1 @P0 STS.64 [lutStore + addr_lut], sliceIF; + +// Keep track of the total size of the lut +--:-:-:-:1 IADD lut_size, lut_size, warp_slices; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +--:1:-:-:2 STS [addr_szLut], lut_size; + }; + } + else # bprop + { + return $N1 ? q{ + +// x_prime = qs + s +// y_prime = pr + r +// z_prime = mt + t +--:-:-:-:1 XMAD z_prime, t, param_dil_d, mt; +--:-:-:-:1 XMAD y_prime, r, param_dil_h, pr; +--:-:-:-:1 XMAD x_prime, s, param_dil_w, qs; +--:-:-:-:1 IADD3 x_prime1, qs, 1, s; +--:-:-:-:1 IADD3 x_prime2, qs, 2, s; +--:-:-:-:1 IADD3 x_prime3, qs, 3, s; + +// z = z_prime / str_d +// z_mod = z_prime % str_d +--:-:-:-:1 XMAD z, z_prime, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z, z, param_shift_str_d; +--:-:-:-:1 XMAD.U16.S16 z_mod, z, neg_str_d, z_prime; +// y = y_prime / str_h +// y_mod = y_prime % str_h +--:-:-:-:1 XMAD y, y_prime, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y, y, param_shift_str_h; +--:-:-:-:1 XMAD.U16.S16 y_mod, y, neg_str_h, y_prime; +// x = x_prime / str_w +// x_mod = x_prime % str_w +--:-:-:-:1 XMAD x, x_prime, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x, x, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod, x, neg_str_w, x_prime; + +--:-:-:-:1 XMAD x1, x_prime1, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x1, x1, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod1, x1, neg_str_w, x_prime1; + +--:-:-:-:1 XMAD x2, x_prime2, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x2, x2, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod2, x2, neg_str_w, x_prime2; + +--:-:-:-:1 XMAD x3, x_prime3, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x3, x3, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod3, x3, neg_str_w, x_prime3; + + 
+--:-:-:-:1 ISETP.GE.AND P0, PT, z_prime, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y_prime, RZ, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.EQ.AND P0, PT, z_mod, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, y_mod, RZ, P1; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P5; +--:-:-:-:1 @P4 R2P PR, predsI, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 ISETP.GE.AND P0, PT, x_prime, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x_prime1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x_prime2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x_prime3, RZ, P3; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, P3; +--:-:-:-:1 ISETP.EQ.AND P0, PT, x_mod, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, x_mod1, RZ, P1; +--:-:-:-:1 ISETP.EQ.AND P2, PT, x_mod2, RZ, P2; +--:-:-:-:1 ISETP.EQ.AND P3, PT, x_mod3, RZ, P3; + +// sliceI = z*HWN + y*WN + x +01:-:-:-:1 XMAD.LO2C slice0I0, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I0, y, param_WN, slice0I0; +--:-:-:-:1 IADD slice0I1, slice0I0, x1; +--:-:-:-:1 IADD slice0I2, slice0I0, x2; +--:-:-:-:1 IADD slice0I3, slice0I0, x3; +--:-:-:-:1 IADD slice0I0, slice0I0, x; + +--:-:-:-:1 @!P0 MOV slice0I0, -1; +--:-:-:-:1 @!P1 MOV slice0I1, -1; +--:-:-:-:1 @!P2 MOV slice0I2, -1; +--:-:-:-:1 @!P3 MOV slice0I3, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 4; +--:-:-:-:1 IADD trs, trs, warp_inc; + +--:1:-:-:1 @P5 STS.128 [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : $N2 ? 
q{ + +// x_prime = qs + s +// y_prime = pr + r +// z_prime = mt + t +--:-:-:-:1 XMAD z_prime, t, param_dil_d, mt; +--:-:-:-:1 XMAD y_prime, r, param_dil_h, pr; +--:-:-:-:1 XMAD x_prime, s, param_dil_w, qs; +--:-:-:-:1 IADD3 x_prime1, qs, 1, s; +--:-:-:-:1 IADD3 x_prime2, qs, 2, s; +--:-:-:-:1 IADD3 x_prime3, qs, 3, s; + +// z = z_prime / str_d +// z_mod = z_prime % str_d +--:-:-:-:1 XMAD z, z_prime, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z, z, param_shift_str_d; +--:-:-:-:1 XMAD.U16.S16 z_mod, z, neg_str_d, z_prime; +// y = y_prime / str_h +// y_mod = y_prime % str_h +--:-:-:-:1 XMAD y, y_prime, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y, y, param_shift_str_h; +--:-:-:-:1 XMAD.U16.S16 y_mod, y, neg_str_h, y_prime; +// x = x_prime / str_w +// x_mod = x_prime % str_w +--:-:-:-:1 XMAD x, x_prime, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x, x, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod, x, neg_str_w, x_prime; + +--:-:-:-:1 XMAD x1, x_prime1, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x1, x1, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod1, x1, neg_str_w, x_prime1; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z_prime, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y_prime, RZ, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.EQ.AND P0, PT, z_mod, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, y_mod, RZ, P1; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P3; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + +--:-:-:-:1 ISETP.GE.AND P0, PT, x_prime, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x_prime1, RZ, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.EQ.AND P0, PT, x_mod, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, x_mod1, RZ, P1; + +// sliceI = z*HWN + y*WN + x*2 +01:-:-:-:1 XMAD.LO2C slice0I0, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I0, y, param_WN, slice0I0; +--:-:-:-:1 ISCADD slice0I1, x1, slice0I0, 1; +--:-:-:-:1 ISCADD slice0I0, x, 
slice0I0, 1; + +--:-:-:-:1 @!P0 MOV slice0I0, -1; +--:-:-:-:1 @!P1 MOV slice0I1, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 3; +--:-:-:-:1 IADD trs, trs, warp_inc; + +--:1:-:-:1 @P5 STS.64 [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : $SN ? q{ +// x_prime = qs + s +// y_prime = pr + r +// z_prime = mt + t +--:-:-:-:1 XMAD z_prime, t, param_dil_d, mt; +--:-:-:-:1 XMAD y_prime, r, param_dil_h, pr; +--:-:-:-:1 XMAD x_prime, s, param_dil_w, qs; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z_prime, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y_prime, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P2, PT, x_prime, RZ, P5; + +// z = z_prime / str_d +// z_prime = z_prime % str_d +--:-:-:-:1 XMAD z, z_prime, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z, z, param_shift_str_d; +--:-:-:-:1 XMAD.U16.S16 z_prime, z, neg_str_d, z_prime; +// y = y_prime / str_h +// y_prime = y_prime % str_h +--:-:-:-:1 XMAD y, y_prime, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y, y, param_shift_str_h; +--:-:-:-:1 XMAD.U16.S16 y_prime, y, neg_str_h, y_prime; +// x = x_prime / str_w +// x_prime = x_prime % str_w +--:-:-:-:1 XMAD x, x_prime, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x, x, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_prime, x, neg_str_w, x_prime; + +--:-:-:-:1 ISETP.EQ.AND P0, PT, z_prime, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, y_prime, RZ, P1; +--:-:-:-:1 ISETP.EQ.AND P2, PT, x_prime, RZ, P2; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x, param_W, P2; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD.LO2C slice0I, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I, y, param_WN, slice0I; +--:-:-:-:1 XMAD slice0I, x, param_N, slice0I; + + +--:-:-:-:1 @!P0 MOV slice0I, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 2; +--:-:-:-:1 IADD trs, trs, warp_inc; + + +--:1:-:-:1 @P5 STS [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U 
LUT_LOOP; + + } : q{ +// x_prime = qs + s +// y_prime = pr + r +// z_prime = mt + t +--:-:-:-:1 XMAD z_prime, t, param_dil_d, mt; +--:-:-:-:1 XMAD y_prime, r, param_dil_h, pr; +--:-:-:-:1 XMAD x_prime, s, param_dil_w, qs; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z_prime, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y_prime, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P2, PT, x_prime, RZ, P5; + +// z = z_prime / str_d +// z_prime = z_prime % str_d +--:-:-:-:1 XMAD z, z_prime, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z, z, param_shift_str_d; +--:-:-:-:1 XMAD.U16.S16 z_prime, z, neg_str_d, z_prime; +// y = y_prime / str_h +// y_prime = y_prime % str_h +--:-:-:-:1 XMAD y, y_prime, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y, y, param_shift_str_h; +--:-:-:-:1 XMAD.U16.S16 y_prime, y, neg_str_h, y_prime; +// x = x_prime / str_w +// x_prime = x_prime % str_w +--:-:-:-:1 XMAD x, x_prime, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x, x, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_prime, x, neg_str_w, x_prime; + +--:-:-:-:1 ISETP.EQ.AND P0, PT, z_prime, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, y_prime, RZ, P1; +--:-:-:-:1 ISETP.EQ.AND P2, PT, x_prime, RZ, P2; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x, param_W, P2; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD sliceI, x, param_N, sliceI; +// sliceF = trs * K +--:-:-:-:1 XMAD sliceF, trs, param_K, RZ; + + + +// Get a mask of all valid slices in the warp +--:-:-:-:1 VOTE.ANY ballot, PT, P0; +// Count the total valid slices +--:-:2:-:1 POPC warp_slices, ballot; +// Prepare lutStore for this and next loop +--:-:-:-:1 @P0 MOV lutStore, lutStore2; +02:-:-:-:1 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +--:-:-:-:1 @P0 LOP.AND dep_thd_bits, dep_thd_mask, 
ballot; +--:-:3:-:1 @P0 POPC dep_thd_cnt, dep_thd_bits; +// use the trs increment to space the barrier sync +--:-:-:-:1 IADD trs, trs, warp_inc; +// Update the lutStore address from this count +04:-:-:-:1 @P0 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +--:1:-:-:1 @P0 STS.64 [lutStore + addr_lut], sliceIF; + +// Keep track of the total size of the lut +--:-:-:-:1 IADD lut_size, lut_size, warp_slices; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +--:1:-:-:2 STS [addr_szLut], lut_size; + }; + } ++] + +END_SETUP: + +01:-:-:-:5 BAR.SYNC 0; + +// Grab the calculated lut size and get its reciprocal +// Get the total reduction depth +[+ + our $LN; return $LN ? q{ +--:-:1:-:2 LDS lutSize, [addr_szLut]; + } : q{ +--:-:-:-:6 MOV lutSize, param_TRS; + }; ++] +01:-:-:-:0 XMAD endCTRS, lutSize, param_C, RZ; +--:-:1:-:2 I2F.F32.S32 lutSizeRcp, lutSize; +--:-:-:-:0 IADD lutSizeM1, lutSize, -1; +01:-:1:-:1 MUFU.RCP lutSizeRcp, lutSizeRcp; + + +--:-:-:-:1 IADD endCTRS32, endCTRS, 32; +// posCTRS = tidY +//--:-:-:-:1 MOV posCTRS, tidY; +// If this value is not a multiple of 32 we want to grab the partial amount on the first fetch. +// If it is a multiple of 32 then make a full 32 line fetch. 
+--:-:-:-:1 LOP.AND.Z P5, partial, endCTRS, 31; +--:-:-:-:1 @P5 MOV partial, 32; +// channel = posCTRS / lutSize +// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it +--:-:2:-:1 I2F.F32.S32 posCTRSf, tidY; +03:-:-:-:1 FMUL channel, posCTRSf, lutSizeRcp; +--:-:-:-:1 FFMA channel, channel, 5.9604644775390625e-08, channel; +--:-:2:-:1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCTRS % lutSize) * 8 +02:-:-:-:1 VMAD.U16.U16 lutOffset0, -channel, lutSize, tidY; + +--:-:-:-:1 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT; + +// posCTRS = tidY + partial +--:-:-:-:1 IADD posCTRS, tidY, partial; +--:-:-:-:1 IADD tidY1, tidY, 1; +[+ + our ($N1, $N2, $LN, $dshift, $slice_scale, $slice_offset, $slice_load); + return $LN ? q{ +// P5 = tidY < partial && lutSize != 0 +--:-:-:-:1 LOP.AND.NZ P6, RZ, lutSize, -1; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, partial, P6; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY1, partial, P6; + +--:-:-:-:1 SHL lutOffset0, lutOffset0, 3; + +// offsetFC = channel * KRST +// offsetIC = channel * DHWN +--:-:-:-:1 XMAD.LO2C offsetIc0, channel, param_DHWN, RZ; +--:-:-:-:1 XMAD offsetFc0, channel, param_TRSK, RZ; + +--:-:-:-:1 @P0 IADD lutOffset1, lutOffset0, 8; +--:-:-:-:1 @P0 MOV offsetFc1, offsetFc0; +--:-:-:-:1 @P0 MOV offsetIc1, offsetIc0; +--:-:-:-:1 @!P0 MOV lutOffset1, RZ; +--:-:-:-:1 @!P0 IADD offsetFc1, offsetFc0, param_TRSK; +--:-:-:-:1 @!P0 IADD offsetIc1, offsetIc0, param_DHWN; + +--:-:5:-:1 @P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut]; +--:-:6:-:1 @P6 LDS.U.64 slice1IF, [lutOffset1 + addr_lut]; + } : qq{ +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, partial, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY1, partial, PT; + +--:-:-:-:1 XMAD offsetFc0, tidY, param_K, RZ; +--:-:-:-:1 XMAD offsetFc1, tidY1, param_K, RZ; + +--:-:-:-:1 XMAD partial, partial, param_K, RZ; +--:-:-:-:1 SHL partial, partial, $dshift; + +--:-:-:-:1 ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale; +--:-:-:-:1 XMAD.LO2C offsetIc0, 
channel, param_DHWN, RZ; + +--:-:-:-:1 \@P0 IADD lutOffset1, lutOffset0, $slice_offset; +--:-:-:-:1 \@P0 MOV offsetIc1, offsetIc0; +--:-:-:-:1 \@!P0 MOV lutOffset1, sb_offset; +--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN; + +--:-:5:-:1 \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4]; +--:-:6:-:1 \@P6 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4]; + }; ++] + + + +[+ + our ($LN, $dshift); + return $LN ? qq{ +10:-:-:-:1 IADD3 offsetFc0, offsetFc0, sliceF0, k; +--:-:-:-:1 LEA track0F0.CC, offsetFc0, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track0F1, offsetFc0, param_F[1], RZ, $dshift; + +20:-:-:-:1 IADD3 offsetFc1, offsetFc1, sliceF1, k; +--:-:-:-:1 LEA track1F0.CC, offsetFc1, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1F1, offsetFc1, param_F[1], RZ, $dshift; + } : qq{ +--:-:-:-:1 IADD offsetFc0, offsetFc0, k; +--:-:-:-:1 IADD offsetFc1, offsetFc1, k; +--:-:-:-:1 LEA track0F0.CC, offsetFc0, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track0F1, offsetFc0, param_F[1], RZ, $dshift; +--:-:-:-:1 LEA track1F0.CC, offsetFc1, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1F1, offsetFc1, param_F[1], RZ, $dshift; + }; ++] +[+ + our ($K1, $dtype, $vsize, $dsize); + return $K1 ? 
qq{ +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; +--:-:-:-:1 \@!P0 MOV F00, RZ; +--:-:-:-:1 \@!P1 MOV F01, RZ; +--:-:-:-:1 \@!P2 MOV F02, RZ; +--:-:-:-:1 \@!P3 MOV F03, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>]; +--:-:1:-:1 \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>]; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.L.U64 preds, preds, 4, preds; +--:-:-:-:1 \@!P0 MOV F10, RZ; +--:-:-:-:1 \@!P1 MOV F11, RZ; +--:-:-:-:1 \@!P2 MOV F12, RZ; +--:-:-:-:1 \@!P3 MOV F13, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>]; + +--:-:-:-:1 \@P6 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P6 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; +--:-:-:-:1 \@!P0 MOV F20, RZ; +--:-:-:-:1 \@!P1 MOV F21, RZ; +--:-:-:-:1 \@!P2 MOV F22, RZ; +--:-:-:-:1 \@!P3 MOV F23, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>]; +--:-:3:-:1 \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>]; + +--:-:-:-:1 \@P6 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P6 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.L.U64 preds, preds, 4, preds; +--:-:-:-:1 \@!P0 MOV F30, RZ; +--:-:-:-:1 \@!P1 MOV F31, RZ; +--:-:-:-:1 \@!P2 MOV F32, RZ; +--:-:-:-:1 \@!P3 MOV F33, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>]; +--:-:4:-:1 \@P3 LDG.E.CI.$dtype F33, [track1F 
+ ${dsize}x<35>]; + } : qq{ + +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k, param_Km32, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k, param_K, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, k, param_Km32, P6; + + +--:-:1:-:1 \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>]; +--:-:2:-:1 \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>]; +--:-:3:-:1 \@P2 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>]; +--:-:4:-:1 \@P3 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>]; + +--:-:-:-:1 \@!P0 LDS.U.$vsize F0, [addr_zero]; +--:-:-:-:1 \@!P1 LDS.U.$vsize F1, [addr_zero]; +--:-:-:-:1 \@!P2 LDS.U.$vsize F2, [addr_zero]; +--:-:1:-:1 \@!P3 LDS.U.$vsize F3, [addr_zero]; + + }; ++] + + +[+ + our ($N1, $N2, $SN, $dshift, $vsizeI); + return $N1 ? qq{ +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P2, PT, slice0I2, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice0I3, RZ, P5; +--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0; +--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0; +--:-:-:-:1 IADD slice0I2, slice0I2, offsetIc0; +--:-:-:-:1 IADD slice0I3, slice0I3, offsetIc0; +--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I4.CC, slice0I2, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I5, slice0I2, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I6.CC, slice0I3, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I7, slice0I3, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0]; +--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I01, [track0I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I02, [track0I4]; +--:-:5:-:1 \@P3 LDG.E.CI.$vsizeI I03, [track0I6]; + +--:-:-:-:1 \@!P0 MOV I00, RZ; +--:-:-:-:1 \@!P1 MOV I01, RZ; +--:-:-:-:1 \@!P2 MOV I02, RZ; +--:-:-:-:1 \@!P3 MOV I03, RZ; + 
+20:-:-:-:1 ISETP.GE.AND P0, PT, slice1I0, RZ, P6; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice1I1, RZ, P6; +--:-:-:-:1 ISETP.GE.AND P2, PT, slice1I2, RZ, P6; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I3, RZ, P6; +--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1; +--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1; +--:-:-:-:1 IADD slice1I2, slice1I2, offsetIc1; +--:-:-:-:1 IADD slice1I3, slice1I3, offsetIc1; +--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I4.CC, slice1I2, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I5, slice1I2, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I6.CC, slice1I3, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I7, slice1I3, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I10, [track1I0]; +--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I11, [track1I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I12, [track1I4]; +--:-:6:-:1 \@P3 LDG.E.CI.$vsizeI I13, [track1I6]; + +--:-:-:-:1 \@!P0 MOV I10, RZ; +--:-:-:-:1 \@!P1 MOV I11, RZ; +--:-:-:-:1 \@!P2 MOV I12, RZ; +--:-:-:-:1 \@!P3 MOV I13, RZ; + + } : $N2 ? 
qq{ + +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5; +20:-:-:-:1 ISETP.GE.AND P2, PT, slice1I0, RZ, P6; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I1, RZ, P6; +--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0; +--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0; +--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1; +--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1; +--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0]; +--:-:5:-:1 \@P1 LDG.E.CI.$vsizeI I02, [track0I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I10, [track1I0]; +--:-:6:-:1 \@P3 LDG.E.CI.$vsizeI I12, [track1I2]; +--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero]; +--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero]; +--:-:-:-:1 \@!P2 LDS.U.$vsizeI I10, [addr_zero]; +--:-:5:-:1 \@!P3 LDS.U.$vsizeI I12, [addr_zero]; + + + } : $SN ? 
qq{ + +10:-:-:-:1 ISETP.GE.AND P5, PT, slice0I, RZ, P5; +20:-:-:-:1 ISETP.GE.AND P6, PT, slice1I, RZ, P6; +--:-:-:-:1 IADD3 slice0I, slice0I, offsetIc0, n; +--:-:-:-:1 IADD3 slice1I, slice1I, offsetIc1, n; +--:-:-:-:1 LEA track0I0.CC, slice0I, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, slice1I, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I, param_I[1], RZ, $dshift; + +--:-:5:-:1 \@P5 LDG.E.CI.$vsizeI I0, [track0I]; +--:-:6:-:1 \@P6 LDG.E.CI.$vsizeI I1, [track1I]; +--:-:-:-:1 \@!P5 LDS.U.$vsizeI I0, [addr_zero]; +--:-:5:-:1 \@!P6 LDS.U.$vsizeI I1, [addr_zero]; + + + } : qq{ +--:-:-:-:1 IADD3 offsetIc0, offsetIc0, sliceI0, n; +--:-:-:-:1 IADD3 offsetIc1, offsetIc1, sliceI1, n; +--:-:-:-:1 LEA track0I0.CC, offsetIc0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, offsetIc0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, offsetIc1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, offsetIc1, param_I[1], RZ, $dshift; + +--:-:5:-:1 \@P5 LDG.E.CI.$vsizeI I0, [track0I]; +--:-:6:-:1 \@P6 LDG.E.CI.$vsizeI I1, [track1I]; +--:-:-:-:1 \@!P5 LDS.U.$vsizeI I0, [addr_zero]; +--:-:5:-:1 \@!P6 LDS.U.$vsizeI I1, [addr_zero]; + + }; ++] + + +[+ + our ($convert_in, $K1); + return !$convert_in ? '' : $K1 ? 
qq{ +01:-:-:-:1 $convert_in F00, F00; +--:-:-:-:1 $convert_in F01, F01; +--:-:-:-:1 $convert_in F02, F02; +--:-:1:-:1 $convert_in F03, F03; + +02:-:-:-:1 $convert_in F10, F10; +--:-:-:-:1 $convert_in F11, F11; +--:-:-:-:1 $convert_in F12, F12; +--:-:2:-:1 $convert_in F13, F13; + +04:-:-:-:1 $convert_in F20, F20; +--:-:-:-:1 $convert_in F21, F21; +--:-:-:-:1 $convert_in F22, F22; +--:-:3:-:1 $convert_in F23, F23; + +08:-:-:-:1 $convert_in F30, F30; +--:-:-:-:1 $convert_in F31, F31; +--:-:-:-:1 $convert_in F32, F32; +--:-:4:-:1 $convert_in F33, F33; + } : qq{ +01:-:-:-:1 $convert_in F03, F01.H1; +--:-:-:-:1 $convert_in F02, F01.H0; +--:-:-:-:1 $convert_in F01, F00.H1; +--:-:1:-:1 $convert_in F00, F00.H0; + +02:-:-:-:1 $convert_in F13, F11.H1; +--:-:-:-:1 $convert_in F12, F11.H0; +--:-:-:-:1 $convert_in F11, F10.H1; +--:-:2:-:1 $convert_in F10, F10.H0; + +04:-:-:-:1 $convert_in F23, F21.H1; +--:-:-:-:1 $convert_in F22, F21.H0; +--:-:-:-:1 $convert_in F21, F20.H1; +--:-:3:-:1 $convert_in F20, F20.H0; + +08:-:-:-:1 $convert_in F33, F31.H1; +--:-:-:-:1 $convert_in F32, F31.H0; +--:-:-:-:1 $convert_in F31, F30.H1; +--:-:4:-:1 $convert_in F30, F30.H0; + }; ++] +[+ + our ($convert_in, $N1, $N2); + return !$convert_in ? '' : $N1 ? qq{ +10:-:-:-:1 $convert_in I03, I03; +--:-:-:-:1 $convert_in I02, I02; +--:-:-:-:1 $convert_in I01, I01; +--:-:5:-:1 $convert_in I00, I00; + +20:-:-:-:1 $convert_in I13, I13; +--:-:-:-:1 $convert_in I12, I12; +--:-:-:-:1 $convert_in I11, I11; +--:-:6:-:1 $convert_in I10, I10; + } : $N2 ? 
qq{ +10:-:-:-:1 $convert_in I03, I02.H1; +--:-:-:-:1 $convert_in I02, I02.H0; +--:-:-:-:1 $convert_in I01, I00.H1; +--:-:5:-:1 $convert_in I00, I00.H0; + +20:-:-:-:1 $convert_in I13, I12.H1; +--:-:-:-:1 $convert_in I12, I12.H0; +--:-:-:-:1 $convert_in I11, I10.H1; +--:-:6:-:1 $convert_in I10, I10.H0; + } : qq{ +10:-:-:-:1 $convert_in I03, I01.H1; +--:-:-:-:1 $convert_in I02, I01.H0; +--:-:-:-:1 $convert_in I01, I00.H1; +--:-:5:-:1 $convert_in I00, I00.H0; + +20:-:-:-:1 $convert_in I13, I11.H1; +--:-:-:-:1 $convert_in I12, I11.H0; +--:-:-:-:1 $convert_in I11, I10.H1; +--:-:6:-:1 $convert_in I10, I10.H0; + }; ++] + +01:-:-:-:1 STS.128 [writeFs + 4x<0*32>], F0; +02:-:-:-:1 STS.128 [writeFs + 4x<1*32>], F1; +04:-:-:-:1 STS.128 [writeFs + 4x<2*32>], F2; +08:-:-:-:1 STS.128 [writeFs + 4x<3*32>], F3; + +10:-:-:-:1 STS.128 [writeIs + 4x<0*32>], I0; +20:-:-:-:1 STS.128 [writeIs + 4x<1*32>], I1; + +--:-:-:-:0 ISETP.LT.AND P5, PT, posCTRS, endCTRS, PT; +--:-:5:-:1 I2F.F32.S32 posCTRSf, posCTRS; + +--:-:-:-:5 BAR.SYNC 0; + + +--:-:-:-:1 IADD writeFs, writeFs, swapBuf; +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + + +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ix0, [readIs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0Ix4, [readIs + 4x<0*32 + 16>]; + + +10:-:-:-:1 FMUL channel, posCTRSf, lutSizeRcp; +--:-:-:-:1 FFMA channel, channel, 5.9604644775390625e-08, channel; +--:-:5:-:1 F2I.S32.F32.TRUNC channel, channel; + +10:-:-:-:1 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS; +--:-:-:-:1 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT; +[+ + our ($N1, $N2, $LN, $dshift, $slice_scale, $slice_offset, $slice_load); + return $LN ? 
q{ + +--:-:-:-:1 SHL lutOffset0, lutOffset0, 3; +--:-:-:-:1 XMAD.LO2C offsetIc0, channel, param_DHWN, RZ; +--:-:-:-:1 XMAD offsetFc0, channel, param_TRSK, RZ; + +--:-:-:-:1 @P0 IADD lutOffset1, lutOffset0, 8; +--:-:-:-:1 @P0 MOV offsetFc1, offsetFc0; +--:-:-:-:1 @P0 MOV offsetIc1, offsetIc0; +--:-:-:-:1 @!P0 MOV lutOffset1, RZ; +--:-:-:-:1 @!P0 IADD offsetFc1, offsetFc0, param_TRSK; +--:-:-:-:1 @!P0 IADD offsetIc1, offsetIc0, param_DHWN; + +--:-:-:-:1 IADD posCTRS, posCTRS, 32; +--:-:5:-:1 @P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut]; +--:-:6:-:1 @P5 LDS.U.64 slice1IF, [lutOffset1 + addr_lut]; + + } : qq{ + +--:-:-:-:1 ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale; +--:-:-:-:1 XMAD.LO2C offsetIc0, channel, param_DHWN, RZ; + +--:-:-:-:1 \@P0 IADD lutOffset1, lutOffset0, $slice_offset; +--:-:-:-:1 \@P0 MOV offsetIc1, offsetIc0; +--:-:-:-:1 \@!P0 MOV lutOffset1, sb_offset; +--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN; + +--:-:-:-:1 IADD posCTRS, posCTRS, 32; +--:-:5:-:1 \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4]; +--:-:6:-:1 \@P5 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4]; + }; ++] + + + + +[+ + our ($LN, $dshift); + return $LN ? qq{ +10:-:-:-:1 IADD3 offsetFc0, offsetFc0, sliceF0, k; +--:-:-:-:1 LEA track0F0.CC, offsetFc0, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track0F1, offsetFc0, param_F[1], RZ, $dshift; + +20:-:-:-:1 IADD3 offsetFc1, offsetFc1, sliceF1, k; +--:-:-:-:1 LEA track1F0.CC, offsetFc1, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1F1, offsetFc1, param_F[1], RZ, $dshift; + } : qq{ +--:-:-:-:1 IADD track0F0.CC, track0F0, partial; +--:-:-:-:1 IADD.X track0F1, track0F1, RZ; +--:-:-:-:1 IADD track1F0.CC, track1F0, partial; +--:-:-:-:1 IADD.X track1F1, track1F1, RZ; + }; ++] + +[+ + our ($K1, $dtype, $vsize, $dsize); + return $K1 ? 
qq{ +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 \@P5 SHF.R.U64 preds, preds, 4, preds; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>]; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 \@P5 SHF.L.U64 preds, preds, 4, preds; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>]; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 \@P5 SHF.R.U64 preds, preds, 4, preds; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>]; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 \@P5 SHF.L.U64 preds, preds, 4, preds; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>]; + } : qq{ +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k, param_Km32, P5; + +--:-:2:-:1 \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>]; +--:-:2:-:1 \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>]; +--:-:2:-:1 \@P0 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>]; +--:-:2:-:1 \@P1 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>]; + }; ++] + + + + +[+ + our ($N1, $N2, $SN, $dshift, $vsizeI); + return $N1 ? 
qq{ + +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P2, PT, slice0I2, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice0I3, RZ, P5; + +--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0; +--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0; +--:-:-:-:1 IADD slice0I2, slice0I2, offsetIc0; +--:-:-:-:1 IADD slice0I3, slice0I3, offsetIc0; +--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I4.CC, slice0I2, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I5, slice0I2, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I6.CC, slice0I3, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I7, slice0I3, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0]; +--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I01, [track0I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I02, [track0I4]; +--:-:2:-:1 \@P3 LDG.E.CI.$vsizeI I03, [track0I6]; + +--:-:-:-:1 \@!P0 MOV I00, RZ; +--:-:-:-:1 \@!P1 MOV I01, RZ; +--:-:-:-:1 \@!P2 MOV I02, RZ; +--:-:-:-:1 \@!P3 MOV I03, RZ; + + +20:-:-:-:1 ISETP.GE.AND P0, PT, slice1I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice1I1, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P2, PT, slice1I2, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I3, RZ, P5; + +--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1; +--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1; +--:-:-:-:1 IADD slice1I2, slice1I2, offsetIc1; +--:-:-:-:1 IADD slice1I3, slice1I3, offsetIc1; +--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I4.CC, slice1I2, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I5, slice1I2, 
param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I6.CC, slice1I3, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I7, slice1I3, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I10, [track1I0]; +--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I11, [track1I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I12, [track1I4]; +--:3:2:-:1 \@P3 LDG.E.CI.$vsizeI I13, [track1I6]; + +--:-:-:-:1 \@!P0 MOV I10, RZ; +--:-:-:-:1 \@!P1 MOV I11, RZ; +--:-:-:-:1 \@!P2 MOV I12, RZ; +--:-:-:-:1 \@!P3 MOV I13, RZ; + + } : $N2 ? qq{ + +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5; +20:-:-:-:1 ISETP.GE.AND P2, PT, slice1I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I1, RZ, P5; + +--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0; +--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0; +--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1; +--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1; +--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0]; +--:-:2:-:1 \@P1 LDG.E.CI.$vsizeI I02, [track0I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I10, [track1I0]; +--:3:2:-:1 \@P3 LDG.E.CI.$vsizeI I12, [track1I2]; +--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero]; +--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero]; +--:-:-:-:1 \@!P2 LDS.U.$vsizeI I10, [addr_zero]; +--:-:-:-:1 \@!P3 LDS.U.$vsizeI I12, [addr_zero]; + + + } : $SN ? 
qq{ + +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I, RZ, P5; +20:-:-:-:1 ISETP.GE.AND P1, PT, slice1I, RZ, P5; +--:-:-:-:1 IADD3 slice0I, slice0I, offsetIc0, n; +--:-:-:-:1 IADD3 slice1I, slice1I, offsetIc1, n; +--:-:-:-:1 LEA track0I0.CC, slice0I, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, slice1I, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I, param_I[1], RZ, $dshift; + +--:-:2:-:1 \@P0 LDG.E.CI.$vsizeI I0, [track0I]; +--:3:2:-:1 \@P1 LDG.E.CI.$vsizeI I1, [track1I]; +--:-:-:-:1 \@!P0 LDS.U.$vsizeI I0, [addr_zero]; +--:-:-:-:1 \@!P1 LDS.U.$vsizeI I1, [addr_zero]; + + + } : qq{ +--:-:-:-:1 IADD3 offsetIc0, offsetIc0, sliceI0, n; +--:-:-:-:1 IADD3 offsetIc1, offsetIc1, sliceI1, n; +--:-:-:-:1 LEA track0I0.CC, offsetIc0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, offsetIc0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, offsetIc1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, offsetIc1, param_I[1], RZ, $dshift; + +--:-:2:-:1 \@P5 LDG.E.CI.$vsizeI I0, [track0I]; +--:3:2:-:1 \@P5 LDG.E.CI.$vsizeI I1, [track1I]; + + }; ++] + + +LOOP: +[+ + our ($N1, $N2, $SN, $LN, $K1, $dtype, $dshift, $dsize, $vsize, $vsizeI, + $convert_in, $slice_scale, $slice_offset, $slice_load); + + my %insert = ( + j0c1 => "--:-:5:-:1 I2F.F32.S32 posCTRSf, posCTRS;\n", + j0c3 => "--:-:-:-:1 ISETP.LT.AND P5, PT, posCTRS, endCTRS, PT;\n", + j0c5 => "--:-:-:-:1 ISETP.LT.AND P6, PT, posCTRS, endCTRS32, PT;\n", + + j0c15 => "10:-:-:-:1 \@P5 FMUL channel, posCTRSf, lutSizeRcp;\n", + j0c20 => "--:-:-:-:1 \@P5 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c22 => "--:-:5:-:1 \@P5 F2I.S32.F32.TRUNC channel, channel;\n", + + $LN ? ( + j0c36 => "10:-:-:-:1 \@P5 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;\n" . + "--:-:-:-:1 \@P5 XMAD offsetIc0, channel, param_DHWN, RZ;\n" . + "--:-:-:-:1 \@P5 XMAD offsetFc0, channel, param_TRSK, RZ;\n" . 
+ "--:-:-:-:1 IADD posCTRS, posCTRS, 32;\n", + + j0c38 => "--:-:-:-:1 \@P5 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;\n" . + "--:-:-:-:1 \@P5 XMAD.PSL offsetIc0, channel, param_DHWN.H1, offsetIc0;\n" . + "--:-:-:-:1 \@P5 SHL lutOffset0, lutOffset0, 3;\n", + + j0c42 => "--:-:5:-:1 \@P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut];\n", + + j0c49 => "--:-:-:-:1 \@P0 I2I.U32.U32 offsetFc1, offsetFc0;\n" . + "--:-:-:-:1 \@!P0 IADD offsetFc1, offsetFc0, param_TRSK;\n", + + j0c50 => "--:-:-:-:1 \@P0 I2I.U32.U32 offsetIc1, offsetIc0;\n" . + "--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;\n", + + j0c51 => "--:-:4:-:1 \@!P0 I2I.U32.U32 lutOffset1, RZ;\n" . + "--:-:-:-:1 \@P0 IADD lutOffset1, lutOffset0, 8;\n", + + j1c44 => "10:-:-:-:1 \@P5 IADD3 offsetFc0, offsetFc0, sliceF0, k;\n", + j1c49 => "04:-:-:-:1 \@P5 LEA track0F0.CC, offsetFc0, param_F[0], $dshift;\n", + j1c54 => "--:-:-:-:1 \@P5 LEA.HI.X track0F1, offsetFc0, param_F[1], RZ, $dshift;\n", + + j2c16 => "08:-:5:-:1 \@P5 LDS.U.64 slice1IF, [lutOffset1 + addr_lut];\n", + + j3c44 => "10:-:-:-:1 \@P5 IADD3 offsetFc1, offsetFc1, sliceF1, k;\n", + j3c49 => "--:-:-:-:1 \@P5 LEA track1F0.CC, offsetFc1, param_F[0], $dshift;\n", + j3c54 => "--:-:-:-:1 \@P5 LEA.HI.X track1F1, offsetFc1, param_F[1], RZ, $dshift;\n", + + j5c44 => "--:-:-:-:1 \@P5 IADD3 offsetIc0, offsetIc0, sliceI0, n;\n", + j5c49 => "--:-:-:-:1 \@P5 LEA track0I0.CC, offsetIc0, param_I[0], $dshift;\n", + j5c54 => "--:-:-:-:1 \@P5 LEA.HI.X track0I1, offsetIc0, param_I[1], RZ, $dshift;\n", + j5c60 => "20:-:2:-:1 \@P5 LDG.E.CI.$vsize I0, [track0I];\n", + + j6c44 => "--:-:-:-:1 \@P5 IADD3 offsetIc1, offsetIc1, sliceI1, n;\n", + j6c49 => "--:-:-:-:1 \@P5 LEA track1I0.CC, offsetIc1, param_I[0], $dshift;\n", + j6c54 => "--:-:-:-:1 \@P5 LEA.HI.X track1I1, offsetIc1, param_I[1], RZ, $dshift;\n", + j6c60 => "20:3:2:-:1 \@P5 LDG.E.CI.$vsize I1, [track1I];\n", + + ) : ( + j0c36 => "10:-:-:-:1 \@P5 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;\n" . 
+ "--:-:-:-:1 \@P5 XMAD offsetIc0, channel, param_DHWN, RZ;\n" . + "--:-:-:-:1 IADD posCTRS, posCTRS, 32;\n", + + j0c39 => "--:-:-:-:1 \@P5 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;\n" . + "--:-:-:-:1 \@P5 XMAD.PSL offsetIc0, channel, param_DHWN.H1, offsetIc0;\n" . + "--:-:-:-:1 \@P5 ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale;\n", + + j0c43 => "--:-:-:-:1 \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4];\n", + + j0c50 => "--:-:-:-:1 \@P0 I2I.U32.U32 offsetIc1, offsetIc0;\n" . + "--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;\n", + + j0c51 => "--:-:4:-:1 \@!P0 I2I.U32.U32 lutOffset1, sb_offset;\n" . + "--:-:-:-:1 \@P0 IADD lutOffset1, lutOffset0, $slice_offset;\n", + + j2c16 => "08:-:-:-:1 \@P5 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4];\n", + + j1c49 => "04:-:-:-:1 \@P5 IADD track0F0.CC, track0F0, param_K32p;\n", + j1c54 => "--:-:-:-:1 \@P5 IADD.X track0F1, track0F1, RZ;\n", + + j3c49 => "--:-:-:-:1 \@P5 IADD track1F0.CC, track1F0, param_K32p;\n", + j3c54 => "--:-:-:-:1 \@P5 IADD.X track1F1, track1F1, RZ;\n", + ), + + $N1 ? ( + + j5c31 => "--:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0;\n" . + "--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0;\n" . + "--:-:-:-:1 ISETP.GE.AND P2, PT, slice0I2, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I2, slice0I2, offsetIc0;\n" . + "--:-:-:-:1 ISETP.GE.AND P3, PT, slice0I3, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I3, slice0I3, offsetIc0;\n", + + j5c32 => "--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift;\n", + j5c37 => "--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift;\n", + j5c42 => "--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift;\n" . 
+ "--:-:-:-:1 LEA track0I4.CC, slice0I2, param_I[0], $dshift;\n", + j5c47 => "--:-:-:-:1 LEA.HI.X track0I5, slice0I2, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track0I6.CC, slice0I3, param_I[0], $dshift;\n", + j5c52 => "--:-:-:-:1 LEA.HI.X track0I7, slice0I3, param_I[1], RZ, $dshift;\n", + + j5c55 => "20:-:-:-:1 \@!P0 I2I.U32.U32 I00, RZ;\n", + j5c57 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I01, RZ;\n", + j5c59 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I02, RZ;\n", + j5c61 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I03, RZ;\n", + + j5c56 => "--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0];\n", + j5c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I01, [track0I2];\n", + j5c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I02, [track0I4];\n", + j5c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$vsizeI I03, [track0I6];\n", + + j6c31 => "--:-:-:-:1 ISETP.GE.AND P0, PT, slice1I0, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1;\n" . + "--:-:-:-:1 ISETP.GE.AND P1, PT, slice1I1, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1;\n" . + "--:-:-:-:1 ISETP.GE.AND P2, PT, slice1I2, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I2, slice1I2, offsetIc1;\n" . + "--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I3, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I3, slice1I3, offsetIc1;\n", + + j6c32 => "--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift;\n", + j6c37 => "--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift;\n", + j6c42 => "--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track1I4.CC, slice1I2, param_I[0], $dshift;\n", + j6c47 => "--:-:-:-:1 LEA.HI.X track1I5, slice1I2, param_I[1], RZ, $dshift;\n" . 
+ "--:-:-:-:1 LEA track1I6.CC, slice1I3, param_I[0], $dshift;\n", + j6c52 => "--:-:-:-:1 LEA.HI.X track1I7, slice1I3, param_I[1], RZ, $dshift;\n", + + j6c55 => "20:-:-:-:1 \@!P0 I2I.U32.U32 I10, RZ;\n", + j6c57 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I11, RZ;\n", + j6c59 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I12, RZ;\n", + j6c61 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I13, RZ;\n", + + j6c56 => "--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I10, [track1I0];\n", + j6c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I11, [track1I2];\n", + j6c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I12, [track1I4];\n", + j6c62 => "--:3:2:-:1 \@P3 LDG.E.CI.$vsizeI I13, [track1I6];\n", + + ) : $N2 ? ( + + j5c31 => "--:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0;\n" . + "--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0;\n", + + j5c35 => "--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift;\n", + j5c40 => "--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift;\n", + j5c45 => "--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift;\n", + + j5c46 => "--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero];\n", + j5c47 => "--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero];\n", + + j5c60 => "20:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0];\n", + j5c62 => "--:-:2:-:1 \@P1 LDG.E.CI.$vsizeI I02, [track0I2];\n", + + j6c31 => "--:-:-:-:1 ISETP.GE.AND P0, PT, slice1I0, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1;\n" . + "--:-:-:-:1 ISETP.GE.AND P1, PT, slice1I1, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1;\n", + + j6c35 => "--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift;\n", + j6c40 => "--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift;\n" . 
+ "--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift;\n", + j6c45 => "--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift;\n", + + j6c46 => "--:-:-:-:1 \@!P0 LDS.U.$vsizeI I10, [addr_zero];\n", + j6c47 => "--:-:-:-:1 \@!P1 LDS.U.$vsizeI I12, [addr_zero];\n", + + j6c60 => "20:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I10, [track1I0];\n", + j6c62 => "--:3:2:-:1 \@P1 LDG.E.CI.$vsizeI I12, [track1I2];\n", + + ) : $SN ? ( + j5c31 => "--:-:-:-:1 ISETP.GE.AND P2, PT, slice0I, RZ, P5;\n", + j5c45 => "--:-:-:-:1 \@!P2 LDS.U.$vsize I0, [addr_zero];\n", + + j5c44 => "--:-:-:-:1 \@P5 IADD3 offsetIc0, offsetIc0, slice0I, n;\n", + j5c49 => "--:-:-:-:1 \@P5 LEA track0I0.CC, offsetIc0, param_I[0], $dshift;\n", + j5c54 => "--:-:-:-:1 \@P5 LEA.HI.X track0I1, offsetIc0, param_I[1], RZ, $dshift;\n", + j5c60 => "20:-:2:-:1 \@P2 LDG.E.CI.$vsize I0, [track0I];\n", + + j6c31 => "--:-:-:-:1 ISETP.GE.AND P2, PT, slice1I, RZ, P5;\n", + j6c45 => "--:-:-:-:1 \@!P2 LDS.U.$vsize I1, [addr_zero];\n", + + j6c44 => "--:-:-:-:1 \@P5 IADD3 offsetIc1, offsetIc1, slice1I, n;\n", + j6c49 => "--:-:-:-:1 \@P5 LEA track1I0.CC, offsetIc1, param_I[0], $dshift;\n", + j6c54 => "--:-:-:-:1 \@P5 LEA.HI.X track1I1, offsetIc1, param_I[1], RZ, $dshift;\n", + j6c60 => "20:3:2:-:1 \@P2 LDG.E.CI.$vsize I1, [track1I];\n", + ) : (), + + j1c30 => "20:6:-:-:1 \@P6 STS.128 [writeFs + 4x<0*32>], F0;\n", + j2c30 => "20:6:-:-:1 \@P6 STS.128 [writeFs + 4x<1*32>], F1;\n", + j3c30 => "20:6:-:-:1 \@P6 STS.128 [writeFs + 4x<2*32>], F2;\n", + j4c30 => "20:6:-:-:1 \@P6 STS.128 [writeFs + 4x<3*32>], F3;\n", + j5c30 => "20:6:-:-:1 \@P6 STS.128 [writeIs + 4x<0*32>], I0;\n", + j6c30 => "20:6:-:-:1 \@P6 STS.128 [writeIs + 4x<1*32>], I1;\n", + + $convert_in ? ( + j1c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j2c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j3c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j4c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j5c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j6c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + $K1 ? 
( + j1c8 => "--:-:-:-:1 \@P6 $convert_in F00, F00;\n", + j1c10 => "--:-:-:-:1 \@P6 $convert_in F01, F01;\n", + j1c12 => "--:-:-:-:1 \@P6 $convert_in F02, F02;\n", + j1c14 => "--:-:6:-:1 \@P6 $convert_in F03, F03;\n", + + j2c8 => "--:-:-:-:1 \@P6 $convert_in F10, F10;\n", + j2c10 => "--:-:-:-:1 \@P6 $convert_in F11, F11;\n", + j2c12 => "--:-:-:-:1 \@P6 $convert_in F12, F12;\n", + j2c14 => "--:-:6:-:1 \@P6 $convert_in F13, F13;\n", + + j3c8 => "--:-:-:-:1 \@P6 $convert_in F20, F20;\n", + j3c10 => "--:-:-:-:1 \@P6 $convert_in F21, F21;\n", + j3c12 => "--:-:-:-:1 \@P6 $convert_in F22, F22;\n", + j3c14 => "--:-:6:-:1 \@P6 $convert_in F23, F23;\n", + + j4c8 => "--:-:-:-:1 \@P6 $convert_in F30, F30;\n", + j4c10 => "--:-:-:-:1 \@P6 $convert_in F31, F31;\n", + j4c12 => "--:-:-:-:1 \@P6 $convert_in F32, F32;\n", + j4c14 => "--:-:6:-:1 \@P6 $convert_in F33, F33;\n", + ) : ( + j1c8 => "--:-:-:-:1 \@P6 $convert_in F03, F01.H1;\n", + j1c10 => "--:-:-:-:1 \@P6 $convert_in F02, F01.H0;\n", + j1c12 => "--:-:-:-:1 \@P6 $convert_in F01, F00.H1;\n", + j1c14 => "--:-:6:-:1 \@P6 $convert_in F00, F00.H0;\n", + + j2c8 => "--:-:-:-:1 \@P6 $convert_in F13, F11.H1;\n", + j2c10 => "--:-:-:-:1 \@P6 $convert_in F12, F11.H0;\n", + j2c12 => "--:-:-:-:1 \@P6 $convert_in F11, F10.H1;\n", + j2c14 => "--:-:6:-:1 \@P6 $convert_in F10, F10.H0;\n", + + j3c8 => "--:-:-:-:1 \@P6 $convert_in F23, F21.H1;\n", + j3c10 => "--:-:-:-:1 \@P6 $convert_in F22, F21.H0;\n", + j3c12 => "--:-:-:-:1 \@P6 $convert_in F21, F20.H1;\n", + j3c14 => "--:-:6:-:1 \@P6 $convert_in F20, F20.H0;\n", + + j4c8 => "--:-:-:-:1 \@P6 $convert_in F33, F31.H1;\n", + j4c10 => "--:-:-:-:1 \@P6 $convert_in F32, F31.H0;\n", + j4c12 => "--:-:-:-:1 \@P6 $convert_in F31, F30.H1;\n", + j4c14 => "--:-:6:-:1 \@P6 $convert_in F30, F30.H0;\n", + ), + $N1 ? 
( + j5c8 => "--:-:-:-:1 \@P6 $convert_in I03, I03;\n", + j5c10 => "--:-:-:-:1 \@P6 $convert_in I02, I02;\n", + j5c12 => "--:-:-:-:1 \@P6 $convert_in I01, I01;\n", + j5c14 => "--:-:6:-:1 \@P6 $convert_in I00, I00;\n", + + j6c8 => "--:-:-:-:1 \@P6 $convert_in I13, I13;\n", + j6c10 => "--:-:-:-:1 \@P6 $convert_in I12, I12;\n", + j6c12 => "--:-:-:-:1 \@P6 $convert_in I11, I11;\n", + j6c14 => "--:-:6:-:1 \@P6 $convert_in I10, I10;\n", + ) : $N2 ? ( + j5c8 => "--:-:-:-:1 \@P6 $convert_in I03, I02.H1;\n", + j5c10 => "--:-:-:-:1 \@P6 $convert_in I02, I02.H0;\n", + j5c12 => "--:-:-:-:1 \@P6 $convert_in I01, I00.H1;\n", + j5c14 => "--:-:6:-:1 \@P6 $convert_in I00, I00.H0;\n", + + j6c8 => "--:-:-:-:1 \@P6 $convert_in I13, I12.H1;\n", + j6c10 => "--:-:-:-:1 \@P6 $convert_in I12, I12.H0;\n", + j6c12 => "--:-:-:-:1 \@P6 $convert_in I11, I10.H1;\n", + j6c14 => "--:-:6:-:1 \@P6 $convert_in I10, I10.H0;\n", + ) : ( + j5c8 => "--:-:-:-:1 \@P6 $convert_in I03, I01.H1;\n", + j5c10 => "--:-:-:-:1 \@P6 $convert_in I02, I01.H0;\n", + j5c12 => "--:-:-:-:1 \@P6 $convert_in I01, I00.H1;\n", + j5c14 => "--:-:6:-:1 \@P6 $convert_in I00, I00.H0;\n", + + j6c8 => "--:-:-:-:1 \@P6 $convert_in I13, I11.H1;\n", + j6c10 => "--:-:-:-:1 \@P6 $convert_in I12, I11.H0;\n", + j6c12 => "--:-:-:-:1 \@P6 $convert_in I11, I10.H1;\n", + j6c14 => "--:-:6:-:1 \@P6 $convert_in I10, I10.H0;\n", + ), + ) : ( + j1c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j2c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j3c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j4c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j5c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j6c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + ), + + $K1 ? 
( + j1c31 => "--:-:-:-:1 \@P5 R2P PR, preds, 0x0f;\n", + j1c32 => "--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f;\n", + j1c33 => "--:-:-:-:1 \@P5 SHF.R.U64 preds, preds, 4, preds;\n", + j1c56 => "20:-:-:-:1 \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>];\n", + j1c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>];\n", + j1c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>];\n", + j1c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>];\n", + + j2c31 => "--:-:-:-:1 \@P5 R2P PR, preds, 0x0f;\n", + j2c32 => "--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f;\n", + j2c33 => "--:-:-:-:1 \@P5 SHF.L.U64 preds, preds, 4, preds;\n", + j2c56 => "20:-:-:-:1 \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>];\n", + j2c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>];\n", + j2c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>];\n", + j2c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>];\n", + + j3c31 => "--:-:-:-:1 \@P5 R2P PR, preds, 0x0f;\n", + j3c32 => "--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f;\n", + j3c33 => "--:-:-:-:1 \@P5 SHF.R.U64 preds, preds, 4, preds;\n", + j3c56 => "20:-:-:-:1 \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>];\n", + j3c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>];\n", + j3c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>];\n", + j3c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>];\n", + + j4c31 => "--:-:-:-:1 \@P5 R2P PR, preds, 0x0f;\n", + j4c32 => "--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f;\n", + j4c33 => "--:-:-:-:1 \@P5 SHF.L.U64 preds, preds, 4, preds;\n", + j4c56 => "20:-:-:-:1 \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>];\n", + j4c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>];\n", + j4c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>];\n", + j4c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>];\n", + + ) : ( + j0c52 => "--:-:-:-:1 
ISETP.LT.AND P0, PT, k, param_K, P5;\n", + j0c53 => "--:-:-:-:1 ISETP.LT.AND P1, PT, k, param_Km32, P5;\n", + + j1c60 => "20:-:2:-:1 \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>];\n", + j2c60 => "20:-:2:-:1 \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>];\n", + j3c60 => "20:-:2:-:1 \@P0 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>];\n", + j4c60 => "20:-:2:-:1 \@P1 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>];\n", + ), + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P6 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P6' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx0, [readIs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dIx4, [readIs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + + +--:-:-:-:1 MOV alpha, param_alpha; + +--:-:-:-:1 ISETP.EQ.AND P4, PT, RZ, param_flags, PT; + +--:-:-:-:1 LOP.AND tid_31, tid, 31; +--:-:-:-:1 SHR.U32 tid_32, tid, 5; + +// readOs = ((tid_32 << 7) + tid_31) << 2 +--:-:-:-:1 ISCADD readOs, tid_32, tid_31, 7; +--:-:-:-:1 SHL readOs, readOs, 2; + +--:-:-:-:1 ISETP.EQ.AND P6, PT, tid_31, RZ, PT; + +// k = idx_K*64 + tid_32 +--:-:-:-:1 ISCADD k00, idx_K, tid_32, 6; +--:-:-:-:1 IADD k04, k00, 4; +--:-:-:-:1 IADD k08, k00, 8; +--:-:-:-:1 IADD k12, k00, 12; + +[+ + our $bsum; return $bsum ? q{ +--:-:-:-:1 XMAD bsum_offset, idx_Q, param_gridN, idx_N; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_P, param_gridQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset; + } : ''; ++] + +[+ + our $LN; return $LN ? q{ +// n = idx_N*32 + tid31; +--:-:-:-:1 ISCADD N, idx_N, tid_31, 5; +// n < N +--:-:-:-:1 ISETP.LT.AND P4, PT, N, param_N, P4; + +// o = k*MPQN + m*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD offset, idx_Q, param_N, N; +--:-:-:-:1 XMAD.LO2C offset, idx_P, param_QN, offset; +--:-:-:-:1 XMAD.LO2C offset, idx_M, param_PQN, offset; + + } : q{ + +--:-:-:-:1 SHL M, idx_M, param_shiftM; +--:-:-:-:1 SHL P, idx_P, param_shiftP; +--:-:-:-:1 SHL Q, idx_Q, param_shiftQ; +--:-:-:-:1 SHL N, idx_N, param_shiftN; + +--:-:-:-:1 BFE.U32 super_M, tid_31, param_SuperM; +--:-:-:-:1 BFE.U32 super_P, tid_31, param_SuperP; +--:-:-:-:1 BFE.U32 super_Q, tid_31, param_SuperQ; +--:-:-:-:1 LOP.AND super_N, tid_31, param_SuperN; + +--:-:-:-:1 IADD M, M, super_M; +--:-:-:-:1 IADD P, P, super_P; +--:-:-:-:1 IADD Q, Q, super_Q; +--:-:-:-:1 IADD N, N, super_N; + +--:-:-:-:1 ISETP.LT.AND P0, PT, M, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, P, param_P, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, Q, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT,
N, param_N, P0; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P2; + +// o = k*MPQN + m*PQN + p*QN + q*N + N +--:-:-:-:1 XMAD offset, Q, param_N, N; +--:-:-:-:1 XMAD.LO2C offset, P, param_QN, offset; +--:-:-:-:1 XMAD.LO2C offset, M, param_PQN, offset; + }; ++] +--:-:-:-:1 XMAD.LO2C offset, k00, param_MPQN, offset; + +--:-:-:-:1 MOV MPQN16, param_MPQN; +--:-:-:-:1 SHL MPQN4, MPQN16, [+ dshift()+2 +]; +--:-:-:-:1 SHL MPQN16, MPQN16, 4; + +--:-:-:-:1 MOV32I one, 1.0; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:1 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:1 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:1 STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 
[writeOs+4x<0*128 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y1; +--:-:-:-:1 STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y1; +--:-:-:-:1 STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y2; +--:-:-:-:1 STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y2; +--:-:-:-:1 STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y3; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_O; +--:-:-:-:0 IADD readOs, readOs, 4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_O; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:0 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:1 FMUL shuffle_x7y4, cx7y4, alpha; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:1 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:1 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:1 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:1 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:1 FMUL shuffle_x7y7, cx7y7, alpha; 
+--:-:-:-:1 STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y4; +--:-:-:-:1 STS.128 [writeOs+4x<0*128 + 16>], shuffle_x4y4; +--:-:-:-:1 STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y5; +--:-:-:-:1 STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y5; +--:-:-:-:1 STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y6; +--:-:-:-:1 STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y6; +--:-:-:-:1 STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y7; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:0 IADD readOs, readOs, -4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_O; +--:-:-:-:0 IADD readOs, readOs, 4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_O; + +--:-:-:-:5 EXIT; + +STORE_O: + +--:-:-:-:2 ISETP.LT.AND P0, PT, k00, param_K, P4; // k00 < K && n < N +--:-:-:-:2 ISETP.LT.AND P1, PT, k04, param_K, P4; // k04 < K && n < N +--:-:-:-:2 ISETP.LT.AND P2, PT, k08, param_K, P4; // k08 < K && n < N +--:-:-:-:1 ISETP.LT.AND P3, PT, k12, param_K, P4; // k12 < K && n < N +[+ + our ($beta, $brelu, $bprelu, $dshift, $dtype); + return $beta || $brelu || $bprelu ? qq{ + +01:-:-:-:1 LEA Out00_0.CC, offset, param_X[0], $dshift; +--:-:-:-:1 LEA.HI.X Out00_1, offset, param_X[1], RZ, $dshift; +--:-:-:-:1 IADD Out04_0.CC, Out00_0, MPQN4; +--:-:-:-:1 IADD.X Out04_1, Out00_1, RZ; +--:-:-:-:1 IADD Out08_0.CC, Out04_0, MPQN4; +--:-:-:-:1 IADD.X Out08_1, Out04_1, RZ; +--:-:-:-:1 IADD Out12_0.CC, Out08_0, MPQN4; +--:-:-:-:1 IADD.X Out12_1, Out08_1, RZ; + +--:-:-:-:1 \@P0 LDG.E.CI.$dtype b00, [Out00_0]; +--:-:-:-:1 \@!P0 MOV b00, RZ; +--:-:5:-:1 \@P1 LDG.E.CI.$dtype b04, [Out04_0]; +--:-:-:-:1 \@!P1 MOV b04, RZ; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype b08, [Out08_0]; +--:-:-:-:1 \@!P2 MOV b08, RZ; +--:-:6:-:1 \@P3 LDG.E.CI.$dtype b12, [Out12_0]; +--:-:-:-:1 \@!P3 MOV b12, RZ; + + + } : ''; ++] +[+ + our $bias; + return $bias ? 
q{ + +20:-:-:-:1 LEA Sum00_0.CC, k00, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum00_1, k00, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum04_0.CC, k04, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum04_1, k04, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum08_0.CC, k08, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum08_1, k08, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum12_0.CC, k12, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum12_1, k12, param_Sum[1], RZ, 2; + +--:-:-:-:1 @P0 LDG.E.CI b00, [Sum00_0]; +--:-:-:-:1 @!P0 MOV b00, RZ; +--:-:5:-:1 @P1 LDG.E.CI b04, [Sum04_0]; +--:-:-:-:1 @!P1 MOV b04, RZ; +--:-:-:-:1 @P2 LDG.E.CI b08, [Sum08_0]; +--:-:-:-:1 @!P2 MOV b08, RZ; +--:-:6:-:1 @P3 LDG.E.CI b12, [Sum12_0]; +--:-:-:-:1 @!P3 MOV b12, RZ; + + + } : ''; ++] +--:-:-:-:1 LDS o00_0, [readOs + 4x< 0*128 + 0*32 + 0*16>]; +--:-:-:-:1 LDS o00_1, [readOs + 4x< 0*128 + 1*32 + 0*16>]; +--:-:-:-:1 LDS o00_2, [readOs + 4x< 0*128 + 2*32 + 0*16>]; +--:-:1:Y:1 LDS o00_3, [readOs + 4x< 0*128 + 3*32 + 0*16>]; +--:-:-:-:1 LDS o04_0, [readOs + 4x< 4*128 + 0*32 + 1*16>]; +--:-:-:-:1 LDS o04_1, [readOs + 4x< 4*128 + 1*32 + 1*16>]; +--:-:-:-:1 LDS o04_2, [readOs + 4x< 4*128 + 2*32 + 1*16>]; +--:-:2:Y:1 LDS o04_3, [readOs + 4x< 4*128 + 3*32 + 1*16>]; +--:-:-:-:1 LDS o08_0, [readOs + 4x< 8*128 + 0*32 + 2*16>]; +--:-:-:-:1 LDS o08_1, [readOs + 4x< 8*128 + 1*32 + 2*16>]; +--:-:-:-:1 LDS o08_2, [readOs + 4x< 8*128 + 2*32 + 2*16>]; +--:-:3:Y:1 LDS o08_3, [readOs + 4x< 8*128 + 3*32 + 2*16>]; +--:-:-:-:1 LDS o12_0, [readOs + 4x<12*128 + 0*32 + 3*16>]; +--:-:-:-:1 LDS o12_1, [readOs + 4x<12*128 + 1*32 + 3*16>]; +--:-:-:-:1 LDS o12_2, [readOs + 4x<12*128 + 2*32 + 3*16>]; +--:-:4:Y:1 LDS o12_3, [readOs + 4x<12*128 + 3*32 + 3*16>]; + + +01:-:-:-:1 FADD o00_0, o00_0, o00_1; +--:-:-:-:1 FADD o00_2, o00_2, o00_3; +02:-:-:-:1 FADD o04_0, o04_0, o04_1; +--:-:-:-:1 FADD o04_2, o04_2, o04_3; +04:-:-:-:1 FADD o08_0, o08_0, o08_1; +--:-:-:-:1 FADD o08_2, o08_2, o08_3; +08:-:-:-:1 FADD o12_0, o12_0, o12_1; +--:-:-:-:1 FADD o12_2, o12_2, 
o12_3; + +--:-:-:-:1 FADD out00, o00_0, o00_2; +--:-:-:-:1 FADD out04, o04_0, o04_2; +--:-:-:-:1 FADD out08, o08_0, o08_2; +--:-:-:-:3 FADD out12, o12_0, o12_2; +[+ + our $bias; return $bias ? q{ +10:-:-:-:1 FADD out00, out00, b00; +--:-:-:-:1 FADD out04, out04, b04; +20:-:-:-:1 FADD out08, out08, b08; +--:-:-:-:1 FADD out12, out12, b12; + } : ''; ++] +[+ + our $relu; return $relu ? q{ +// maximum(x, 0) +--:-:-:-:1 FMNMX out00, out00, RZ, !PT; +--:-:-:-:1 FMNMX out04, out04, RZ, !PT; +--:-:-:-:1 FMNMX out08, out08, RZ, !PT; +--:-:-:-:1 FMNMX out12, out12, RZ, !PT; + } : ''; ++] +[+ + our $prelu; return $prelu ? q{ +// maximum(x, 0) + slope * minimum(0, x) +--:-:-:-:1 FMNMX b00, out00, RZ, !PT; +--:-:-:-:1 FMNMX b04, out04, RZ, !PT; +--:-:-:-:1 FMNMX b08, out08, RZ, !PT; +--:-:-:-:1 FMNMX b12, out12, RZ, !PT; + +--:-:-:-:1 FMNMX x00, out00, RZ, PT; +--:-:-:-:1 FMNMX x04, out04, RZ, PT; +--:-:-:-:1 FMNMX x08, out08, RZ, PT; +--:-:-:-:1 FMNMX x12, out12, RZ, PT; + +--:-:-:-:1 FFMA out00, x00, param_beta, b00; +--:-:-:-:1 FFMA out04, x04, param_beta, b04; +--:-:-:-:1 FFMA out08, x08, param_beta, b08; +--:-:-:-:1 FFMA out12, x12, param_beta, b12; + } : ''; ++] + + +[+ + our ($beta, $brelu, $bprelu, $convert_in); + return $convert_in && ($beta || $brelu || $bprelu) ? qq{ +10:-:1:-:1 \@P0 $convert_in b00, b00; +--:-:2:-:1 \@P1 $convert_in b04, b04; +20:-:3:-:1 \@P2 $convert_in b08, b08; +--:-:4:-:1 \@P3 $convert_in b12, b12; + } : ''; ++] +[+ + our $beta; return $beta ? q{ +11:-:-:-:1 FFMA out00, b00, param_beta, out00; +02:-:-:-:1 FFMA out04, b04, param_beta, out04; +24:-:-:-:1 FFMA out08, b08, param_beta, out08; +08:-:-:-:1 FFMA out12, b12, param_beta, out12; + } : ''; ++] +[+ + our $brelu; return $brelu ? 
q{ +//delta *= x > 0 +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +11:-:-:-:1 FSETP.GT.AND P0, PT, b00, RZ, PT; +02:-:-:-:1 FSETP.GT.AND P1, PT, b04, RZ, PT; +24:-:-:-:1 FSETP.GT.AND P2, PT, b08, RZ, PT; +08:-:-:-:1 FSETP.GT.AND P3, PT, b12, RZ, PT; +--:-:-:-:1 @!P0 MOV out00, RZ; +--:-:-:-:1 @!P1 MOV out04, RZ; +--:-:-:-:1 @!P2 MOV out08, RZ; +--:-:-:-:1 @!P3 MOV out12, RZ; +--:-:-:Y:d R2P PR, preds, 0x0f; + + } : ''; ++] +[+ + our $bprelu; return $bprelu ? q{ +//delta *= ((x > 0) + slope * (x < 0)) +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +11:-:-:-:1 FSETP.GT.AND P0, PT, b00, RZ, PT; +02:-:-:-:1 FSETP.GT.AND P1, PT, b04, RZ, PT; +24:-:-:-:1 FSETP.GT.AND P2, PT, b08, RZ, PT; +08:-:-:-:1 FSETP.GT.AND P3, PT, b12, RZ, PT; +--:-:-:-:1 SEL x00, one, RZ, P0; +--:-:-:-:1 SEL x04, one, RZ, P1; +--:-:-:-:1 SEL x08, one, RZ, P2; +--:-:-:-:1 SEL x12, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b00, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b04, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b08, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b12, RZ, PT; +--:-:-:-:1 SEL b00, one, RZ, P0; +--:-:-:-:1 SEL b04, one, RZ, P1; +--:-:-:-:1 SEL b08, one, RZ, P2; +--:-:-:-:1 SEL b12, one, RZ, P3; +--:-:-:-:1 R2P PR, preds, 0x0f; +--:-:-:-:1 FFMA b00, b00, param_beta, x00; +--:-:-:-:1 FFMA b04, b04, param_beta, x04; +--:-:-:-:1 FFMA b08, b08, param_beta, x08; +--:-:-:-:1 FFMA b12, b12, param_beta, x12; +--:-:-:-:1 FMUL out00, out00, b00; +--:-:-:-:1 FMUL out04, out04, b04; +--:-:-:-:1 FMUL out08, out08, b08; +--:-:-:-:2 FMUL out12, out12, b12; + } : ''; ++] +[+ + our $bsum; return $bsum ? q{ +20:-:-:-:1 SEL sum00, out00, RZ, P0; +--:-:-:-:1 SEL sum04, out04, RZ, P1; +--:-:-:-:1 SEL sum08, out08, RZ, P2; +--:-:-:-:1 SEL sum12, out12, RZ, P3; + } : ''; ++] + +[+ + our $convert_out; return $convert_out ? 
qq{ +--:-:1:-:1 \@P0 $convert_out out00, out00; +--:-:2:-:1 \@P1 $convert_out out04, out04; +--:-:3:-:1 \@P2 $convert_out out08, out08; +--:-:4:-:1 \@P3 $convert_out out12, out12; + } : ''; ++] + + +--:-:-:-:1 LEA Out00_0.CC, offset, param_O[0], [+ dshift() +]; +--:-:-:-:1 LEA.HI.X Out00_1, offset, param_O[1], RZ, [+ dshift() +]; +--:-:-:-:1 IADD Out04_0.CC, Out00_0, MPQN4; +--:-:-:-:1 IADD.X Out04_1, Out00_1, RZ; +--:-:-:-:1 IADD Out08_0.CC, Out04_0, MPQN4; +--:-:-:-:1 IADD.X Out08_1, Out04_1, RZ; +--:-:-:-:1 IADD Out12_0.CC, Out08_0, MPQN4; +--:-:-:-:1 IADD.X Out12_1, Out08_1, RZ; + +01:-:-:-:1 @P0 STG.E.CG.[+ dtype() +] [Out00_0], out00; +02:-:-:-:1 @P1 STG.E.CG.[+ dtype() +] [Out04_0], out04; +04:-:-:-:1 @P2 STG.E.CG.[+ dtype() +] [Out08_0], out08; +08:1:-:-:1 @P3 STG.E.CG.[+ dtype() +] [Out12_0], out12; + + +[+ + our $bsum; return $bsum ? q{ + +--:-:-:-:1 XMAD.LO2C bsum00, k00, param_gridMPQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum04, k04, param_gridMPQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum08, k08, param_gridMPQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum12, k12, param_gridMPQN, bsum_offset; +--:-:-:-:1 LEA Sum00_0.CC, bsum00, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum00_1, bsum00, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum04_0.CC, bsum04, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum04_1, bsum04, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum08_0.CC, bsum08, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum08_1, bsum08, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum12_0.CC, bsum12, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum12_1, bsum12, param_Sum[1], RZ, 2; +--:-:-:-:1 ISETP.LT.AND P0, PT, k00, param_K, P6; // k00 < K && tid31 == 0 +--:-:-:-:1 ISETP.LT.AND P1, PT, k04, param_K, P6; // k04 < K && tid31 == 0 +--:-:-:-:1 ISETP.LT.AND P2, PT, k08, param_K, P6; // k08 < K && tid31 == 0 +--:-:-:-:1 ISETP.LT.AND P3, PT, k12, param_K, P6; // k12 < K && tid31 == 0 + +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 1, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, 
sum08, 1, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 1, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:1 FADD sum12, x12, sum12; +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 2, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, sum08, 2, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 2, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:1 FADD sum12, x12, sum12; +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 4, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, sum08, 4, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 4, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:1 FADD sum12, x12, sum12; +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 8, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, sum08, 8, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 8, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:1 FADD sum12, x12, sum12; +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 16, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 16, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, sum08, 16, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 16, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:0 FADD sum12, x12, sum12; + + +--:-:-:-:1 @P0 STG.E.CG [Sum00_0], sum00; +--:-:-:-:1 @P1 STG.E.CG [Sum04_0], sum04; +--:-:-:-:1 @P2 STG.E.CG [Sum08_0], sum08; +--:6:-:-:1 @P3 STG.E.CG [Sum12_0], sum12; + } : ''; ++] + +--:-:-:-:1 IADD k00, k00, 16; +--:-:-:-:1 IADD k04, k04, 16; +--:-:-:-:1 IADD k08, k08, 16; +--:-:-:-:1 IADD k12, k12, 16; +--:-:-:-:0 IADD offset, offset, MPQN16; + +--:-:-:-:5 RET; \ No newline at end of file diff --git 
a/Kernel/Convolution/Maxwell/xconv_winograd_2x2_3x3_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_2x2_3x3_32x32.sass new file mode 100644 index 0000000..a8a1ef4 --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_winograd_2x2_3x3_32x32.sass @@ -0,0 +1,1568 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $type; +our $dtype = $type eq 'h' ? 'U16' : '32'; +our $dshift = $type eq 'h' ? '1' : '2'; +our $dsize = $type eq 'h' ? '2' : '4'; +our $vsize = $type eq 'h' ? '64' : '128'; + +sub dtype { return $dtype; } +sub dsize { return $dsize; } +sub dshift { return $dshift; } + +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 
'F2F.F16.F32' : ''; +-] + + + param_S[0] : c[0x0][0x140] + param_S[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_C : c[0x0][0x174] + param_H : c[0x0][0x178] + param_P : c[0x0][0x17c] + param_pad_h : c[0x0][0x180] + param_pad_w : c[0x0][0x184] + param_HWN : c[0x0][0x188] + param_WN : c[0x0][0x18c] + param_PQN : c[0x0][0x190] + param_QN : c[0x0][0x194] + param_Qnk : c[0x0][0x198] + param_nk : c[0x0][0x19c] + param_n : c[0x0][0x1a0] + param_k : c[0x0][0x1a4] + param_magic_Qnk : c[0x0][0x1a8] + param_shift_Qnk : c[0x0][0x1ac] + param_magic_nk : c[0x0][0x1b0] + param_shift_nk : c[0x0][0x1b4] + param_magic_k : c[0x0][0x1b8] + param_shift_k : c[0x0][0x1bc] + param_RSK : c[0x0][0x1c0] + param_4RSKp : c[0x0][0x1c4] + param_4HWNp : c[0x0][0x1c8] + param_gridK : c[0x0][0x1cc] + param_gridP2 : c[0x0][0x1d0] + param_gridQ : c[0x0][0x1d4] + param_gridN : c[0x0][0x1d8] + param_gridQN : c[0x0][0x1dc] + param_gridPQN : c[0x0][0x1e0] + param_superP : c[0x0][0x1e4] + param_superQ : c[0x0][0x1e8] + param_superN : c[0x0][0x1ec] + param_shiftP : c[0x0][0x1f0] + param_shiftQ : c[0x0][0x1f4] + param_shiftN : c[0x0][0x1f8] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 64-79 ~ tid, idx_P, idx_Q, idx_N, idx_K, idx_n, idx_k, tid16, tid31, c, addr_zero, partialC + 80-119 ~ tid1, idx_PQnk, idx_Qnk, idx_nk, magic_Qnk, neg_Qnk, neg_nk, neg_k, div<1-3>, 
idx_P2, idx_Q2, z<1-2>, negOne, super_P, super_Q + 80-95 ~ super_N, y, x, ti, ti_sign, x<1-3>, mask_x, preds1, offsetIC + 80-95 ~ tf, tid31_4, offsetFC + + 120-121 : track<0-1> + 122-127 ~ writeS, readFs, readIs, C, preds, idx_nkpq + + 80-95 ~ p, q, n, tid32, tid64, tid_16, tid_1, q2, p2, to, superP, superQ, superN + 96-99 : Out<0-1>, Sum<0-1> + 100-121 ~ alpha, one, writeCs, readCs, k, PQN15, tid_31, out_offset, bsum_offset + + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + + 64-79 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3> + 80-95 : t0<0-1>, t1<0-1>, t2<0-1>, t3<0-1> + + 3,2,11,10,19,18,27,26,1,0,9,8,17,16,25,24 ~ b<00|01|10|11>, x<00|01|10|11>, sum<0|1>, s0<0-1>, s1<0-1> + + // Image registers (registers assigned to avoid bank conflicts) + 96 = i00 + 97 = i01 + 98 = i02 + 99 = i03 + 100 = i30 + 101 = i31 + 102 = i32 + 103 = i33 + 105 = i13 + 104 = i12 + 107 = i11 + 106 = i10 + 108 = i23, TI23, I23 + 109 = i22, TI22 + 110 = i21, TI21 + 111 = i20, TI20, I20 + 113 = TI00, I00, TI10, I10, I21, I01 + 112 = TI01, I11 + 115 = TI02, I12 + 114 = TI03, I03, TI11, I31 + 116 = TI30, I30, TI12, I32 + 117 = TI31 + 118 = TI32 + 119 = TI33, I33, TI13, I13, I22, I02 + // Filter registers +[+ + our $FX; + return $FX ? 
q{ + 104-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3> + } : q{ + 96 = f00, TF00, F00 + 97 = f01, TF01 + 98 = f02, TF02, F03 + 99 = f10 + 100 = f11 + 101 = f12 + 102 = f20, TF30, F30 + 103 = f21, TF31 + 104 = f22, TF32, F33 + 105 = tb3, F32 + 106 = tb0, F02 + 107 = ta2, TF22, F23 + 108 = ta0, TF20, F20 + 109 = ta1, TF21 + 110 = F01 + 111 = F31 + 112 = TF10, F10 + 113 = TF11 + 114 = TF12, F13 + 115 = tb1, F12 + 116 = tb2, F22 + 117 = F11 + 118 = F21 + }; ++] + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_PQnk, SR_CTAID.X; + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 128, PT; + +--:-:-:-:1 LOP.AND tid31, tid, 31; + +// c = (tid & 127) / 32 +--:-:-:-:1 BFE.U32 c, tid, 0x205; // 2 bits at position 5 + +--:-:-:-:1 SHL addr_zero, tid31, 4; +--:-:-:-:1 ISCADD addr_zero, c, addr_zero, 11; +--:-:-:-:1 @P0 IADD addr_zero, addr_zero, 4x<512*4>; + +--:-:-:-:1 STS.128 [addr_zero + 4x<00*4>], RZ; +--:-:-:-:1 STS.128 [addr_zero + 4x<32*4>], RZ; +--:-:-:-:1 STS.128 [addr_zero + 4x<64*4>], RZ; +--:-:-:-:1 STS.128 [addr_zero + 4x<96*4>], RZ; + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + + +// idx_P2 = idx_PQnk / blk_Qnk +--:-:-:-:1 MOV magic_Qnk, param_magic_Qnk; +--:-:-:-:1 ISETP.NE.AND P1, PT, magic_Qnk, 1, PT; +02:-:-:-:1 @P1 XMAD div1, idx_PQnk, magic_Qnk, RZ; +--:-:-:-:1 @P1 XMAD div2, idx_PQnk, magic_Qnk.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, idx_PQnk.H1, magic_Qnk.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk, div1; +--:-:-:-:1 @P1 IADD3.RS idx_P2, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 idx_P2, idx_P2, param_shift_Qnk; +--:-:-:-:1 @!P1 SHR.U32 idx_P2, idx_PQnk, param_shift_Qnk; + +// idx_Qnk = idx_PQnk % blk_Qnk +--:-:-:-:1 IADD neg_Qnk, RZ, -param_Qnk; +--:-:-:-:1 XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk; + +// idx_Q2 = idx_Qnk / nk +--:-:-:-:1 XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ; +--:-:-:-:1 SHR.U32 idx_Q2, idx_Q2, param_shift_nk; +// idx_nk = idx_Qnk % nk +--:-:-:-:1 IADD neg_nk, RZ, -param_nk; 
+--:-:-:-:1 XMAD.S16.U16 idx_nk, neg_nk, idx_Q2, idx_Qnk; + +// idx_n = idx_nk / k +--:-:-:-:1 XMAD idx_n, idx_nk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_n, idx_n, param_shift_k; +// idx_k = idx_nk % k +--:-:-:-:1 IADD neg_k, RZ, -param_k; +--:-:-:-:1 XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk; + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// idx_P = idx_P2 * 2 +// idx_Q = idx_Q2 +// if idx_P2 != gridP2: +// idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1) +// idx_Q = idx_Q2 >> 1 +--:-:-:-:1 ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT; +--:-:-:-:1 SHL idx_P, idx_P2, 1; +--:-:-:-:1 @P1 LOP.AND z1, idx_Q2, 1; +--:-:-:-:1 @P1 BFE.U32 z2, idx_Q2, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P1 LOP.XOR z1, z1, z2; +--:-:-:-:1 @P1 IADD idx_P, idx_P, z1; +--:-:-:-:1 @P1 SHR.U32 idx_Q, idx_Q2, 1; +--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2; + +// Scan backwards on odd rows +// if idx_P2 & 1: +// idx_Q = gridQ - idx_Q - 1 +--:-:-:-:1 LOP.AND.NZ P2, RZ, idx_P2, 1; +--:-:-:-:1 MOV negOne, -1; +--:-:-:-:1 @P2 IADD3 idx_Q, -idx_Q, param_gridQ, negOne; + +--:-:-:-:1 BFI idx_nkpq, idx_P, 0x0c0c, idx_Q; +--:-:-:-:1 BFI idx_nkpq, idx_k, 0x0418, idx_nkpq; +--:-:-:-:1 BFI idx_nkpq, idx_n, 0x041c, idx_nkpq; + +// x = grid_x << shiftX +// y = grid_y << shiftY +--:-:-:-:1 SHL idx_P, idx_P, param_shiftP; +--:-:-:-:1 SHL idx_Q, idx_Q, param_shiftQ; + +// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp +--:-:-:-:1 BFE.U32 super_P, tid, param_superP; +--:-:-:-:1 BFE.U32 super_Q, tid, param_superQ; +--:-:-:-:1 ISCADD idx_P, super_P, idx_P, 1; +--:-:-:-:1 ISCADD idx_Q, super_Q, idx_Q, 1; + +// If this value is not a multiple of 4 we want to grab the partial amount on the first fetch. +// If it is a multiple of 4 then make a full 4 line fetch. 
+--:-:-:-:1 MOV C, param_C; +--:-:-:-:1 LOP.AND.Z P6, partialC, C, 3; +--:-:-:-:1 @!P6 IADD3 C, C, 4, -partialC; +--:-:-:-:1 @P6 MOV partialC, 4; +// P6 = c < partialC +--:-:-:-:1 ISETP.LT.AND P6, PT, c, partialC, PT; + +[+ + our $FX; return $FX ? '' : q{ +// writeS = c*512 + tid & 31 +--:-:-:-:1 ISCADD writeS, c, tid31, 9; +--:-:-:-:1 ISCADD writeS, writeS, 4x<512*4*2>, 2; + } ++] + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readFs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readIs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid16; +--:-:-:-:1 SHL readIs, readIs, 4; + +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 LOP3.LUT readFs, readFs, tid16, tid1, 0xfe; +--:-:-:-:1 ISCADD readFs, readFs, 4x<512*4>, 4; + + +--:-:-:-:5 @P0 BRA.U FILTER_SETUP; + +--:-:1:-:2 S2R idx_N, SR_CTAID.Z; + + + + +// writeS = c*512 + tid & 31 +[+ + our $FX; + return $FX ? 
q{ +--:-:-:-:1 ISCADD writeS, c, tid31, 9; +--:-:-:-:1 ISCADD writeS, writeS, 4x<512*4*2>, 2; + } : ''; ++] + +--:-:-:-:1 LOP.AND super_N, tid, param_superN; + +01:-:-:-:1 XMAD idx_N, idx_N, param_n, idx_n; +--:-:-:-:1 SHL idx_N, idx_N, param_shiftN; +--:-:-:-:1 IADD idx_N, idx_N, super_N; + +// n < N +--:-:-:-:1 ISETP.LT.AND P5, PT, idx_N, 1x<$N>, PT; + +// Subtract off the padding +--:-:-:-:1 IADD y, idx_P, -param_pad_h; +--:-:-:-:1 IADD x, idx_Q, -param_pad_w; + +// a0 = n + x*N + y*XN + c*YXN +--:-:-:-:1 XMAD.S16.U16 ti, x, 1x<$N>, idx_N; +--:-:-:-:1 XMAD.S16.U16.LO2C ti, y, param_WN, ti; +--:-:-:-:1 XMAD.S16.U16.LO2C ti, c, param_HWN, ti; +--:-:-:-:1 ISET.LT.AND ti_sign, ti, RZ, PT; +--:-:-:-:1 LEA track0.CC, ti, param_I[0], [+ dshift() +]; +--:-:-:-:1 IADD.X track1, ti_sign, param_I[1]; + +--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 IADD x2, x, 2; +--:-:-:-:1 IADD x3, x, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x, 1x<$W>, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, 1x<$W>, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, 1x<$W>, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, 1x<$W>, PT; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x0f; + +--:-:-:-:1 IADD x1, y, 1; +--:-:-:-:1 IADD x2, y, 2; +--:-:-:-:1 IADD x3, y, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, y, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_H, P5; +--:-:-:-:1 ISETP.GE.AND P0, PT, y, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; + +--:-:-:-:1 SEL preds, mask_x, RZ, P0; +--:-:-:-:1 @P1 BFI preds, mask_x, 0x404, preds; +--:-:-:-:1 @P2 BFI preds, mask_x, 0x408, preds; +--:-:-:-:1 @P3 BFI preds, mask_x, 0x40c, preds; + +// For partial C on first load +--:-:-:-:1 SEL 
preds1, preds, RZ, P6; + +// offsetIC = partialC*YXN +--:-:-:-:1 XMAD.LO2C offsetIC, partialC, param_HWN, RZ; + +--:-:-:-:1 R2P PR, preds1, 0x0f; +--:-:-:-:1 SHF.R.U64 preds1, preds1, 12, preds1; + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>]; +--:-:-:-:1 @!P0 MOV i00, RZ; +--:-:-:-:1 @!P1 MOV i01, RZ; +--:-:-:-:1 @!P2 MOV i02, RZ; +--:-:-:-:1 @!P3 MOV i03, RZ; + +--:-:-:-:1 R2P PR, preds1, 0x0f; +--:-:-:-:1 SHF.L.U64 preds1, preds1, 8, preds1; + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>]; +--:-:-:-:1 @!P0 MOV i30, RZ; +--:-:-:-:1 @!P1 MOV i31, RZ; +--:-:-:-:1 @!P2 MOV i32, RZ; +--:-:-:-:1 @!P3 MOV i33, RZ; + +--:-:-:-:1 R2P PR, preds1, 0x0f; +--:-:-:-:1 SHF.R.U64 preds1, preds1, 4, preds1; + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>]; +--:-:-:-:1 @!P0 MOV i10, RZ; +--:-:-:-:1 @!P1 MOV i11, RZ; +--:-:-:-:1 @!P2 MOV i12, RZ; +--:-:-:-:1 @!P3 MOV i13, RZ; + +--:-:-:-:1 R2P PR, preds1, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 
LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>]; +--:6:2:-:1 @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>]; +--:-:-:-:1 @!P0 MOV i20, RZ; +--:-:-:-:1 @!P1 MOV i21, RZ; +--:-:-:-:1 @!P2 MOV i22, RZ; +--:-:-:-:1 @!P3 MOV i23, RZ; + + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>]; + +20:-:-:-:6 LEA track0.CC, offsetIC, track0, [+ dshift() +]; +--:-:-:-:0 LEA.HI.X track1, offsetIC, track1, RZ, [+ dshift() +]; + +--:-:-:-:5 BRA.U IMAGE_LOOP; + + + +FILTER_SETUP: + +--:-:1:-:2 S2R idx_K, SR_CTAID.Y; + + +01:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +[+ + our ($dtype, $dshift, $FX, $K, $vsize, $dsize); + return $FX ? qq{ + +// writeS = (c*512 + (tid & 31)*4)*4 +--:-:-:-:1 SHL writeS, tid31, 4; +--:-:-:-:1 ISCADD writeS, c, writeS, 11; +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<512*4*2>; + +// (kBlks,C,4,4,32) +// offset = idx_K*C*512 + c*512 + tid31*4; +--:-:-:-:1 SHL tid31_4, tid31, 2; +--:-:-:-:1 XMAD tf, idx_K, param_C, c; +--:-:-:-:1 ISCADD tf, tf, tid31_4, 9; +--:-:-:-:1 LEA track0.CC, tf, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1, tf, param_F[1], RZ, $dshift; + +// offsetFC = partialC*512 +--:-:-:-:1 SHL offsetFC, partialC, 9; + +--:-:-:-:1 \@!P6 LDS.U.$vsize F0, [addr_zero]; +--:-:-:-:1 \@!P6 LDS.U.$vsize F1, [addr_zero]; +--:-:-:-:1 \@!P6 LDS.U.$vsize F2, [addr_zero]; +--:-:-:-:1 \@!P6 LDS.U.$vsize F3, [addr_zero]; + +--:-:2:-:1 \@P6 LDG.E.CG.$vsize F0, [track + 4x<00 * $dsize>]; +--:-:3:-:1 \@P6 LDG.E.CG.$vsize F1, [track + 4x<32 * $dsize>]; +--:-:4:-:1 \@P6 LDG.E.CG.$vsize F2, [track + 4x<64 * $dsize>]; +--:6:5:-:1 \@P6 LDG.E.CG.$vsize F3, [track + 4x<96 * $dsize>]; + + + } : qq{ +// k = idx_K*32 + tid & 31 +--:-:-:-:1 ISCADD idx_K, idx_K, tid31, 5; +--:-:-:-:1 ISETP.LT.AND P0, PT, idx_K, 1x<$K>, P6; 
+--:-:-:-:1 ISETP.LT.AND P1, PT, idx_K, 1x<$K>, PT; + +// offsetFC = partialC * RSK +--:-:-:-:1 XMAD.LO2C offsetFC, partialC, param_RSK, RZ; + +// a0 = k + c*RSK +--:-:-:-:1 XMAD.LO2C tf, c, param_RSK, idx_K; + +--:-:-:-:1 LEA track0.CC, tf, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1, tf, param_F[1], RZ, $dshift; + +--:-:-:-:1 \@!P0 MOV f00, RZ; +--:-:-:-:1 \@!P0 MOV f01, RZ; +--:-:-:-:1 \@!P0 MOV f02, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>]; +--:-:-:-:1 \@!P0 MOV f20, RZ; +--:-:-:-:1 \@!P0 MOV f21, RZ; +--:-:-:-:1 \@!P0 MOV f22, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>]; +--:-:-:-:1 \@!P0 MOV f10, RZ; +--:-:-:-:1 \@!P0 MOV f11, RZ; +--:-:-:-:1 \@!P0 MOV f12, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>]; +--:6:2:-:1 \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>]; + }; ++] + + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>]; + +20:-:-:-:6 LEA track0.CC, offsetFC, track0, [+ dshift() +]; +--:-:-:-:0 LEA.HI.X track1, offsetFC, track1, RZ, [+ dshift() +]; + +--:-:-:-:5 BRA.U FILTER_LOOP; + + +IMAGE_LOOP: +--:-:-:-:1 ISETP.GT.AND P6, PT, C, 4, PT; +[+ + our ($dtype, $dsize, $convert_in, $W, $N); + my %insert = ( + + j0c1 => "--:-:-:-:1 ISETP.GT.AND P5, PT, C, RZ, PT;\n" . 
+ "--:-:-:-:1 IADD C, C, -4;\n", + + + j0c14 => "--:-:-:-:1 R2P PR, preds, 0x0f;\n", + j0c16 => "--:-:-:-:1 \@P6 SHF.R.U64 preds, preds, 12, preds;\n", + + $convert_in ? ( + j0c3 => "02:-:-:-:1 $convert_in i00, i00;\n", + j0c5 => "--:-:-:-:1 $convert_in i01, i01;\n", + j0c7 => "--:-:-:-:1 $convert_in i02, i02;\n", + j0c9 => "--:-:-:-:0 \@!P6 MOV preds, RZ;\n" . + "--:-:-:-:1 $convert_in i03, i03;\n", + + j0c11 => "--:-:-:-:1 $convert_in i20, i20;\n", + j0c13 => "--:-:-:-:1 $convert_in i21, i21;\n", + j0c15 => "--:-:-:-:1 $convert_in i22, i22;\n", + j0c17 => "--:-:2:-:1 $convert_in i23, i23;\n", + + j0c19 => "--:-:-:-:1 $convert_in i10, i10;\n", + j0c21 => "--:-:-:-:1 $convert_in i11, i11;\n", + j0c23 => "--:-:-:-:1 $convert_in i12, i12;\n", + j0c25 => "--:-:-:-:1 $convert_in i13, i13;\n", + + j0c27 => "--:-:-:-:1 $convert_in i30, i30;\n", + j0c29 => "--:-:-:-:1 $convert_in i31, i31;\n", + j0c31 => "--:-:-:-:1 $convert_in i32, i32;\n", + j0c33 => "--:-:3:-:1 $convert_in i33, i33;\n", + ) : ( + j0c9 => "--:-:-:-:1 \@!P6 MOV preds, RZ;\n", + ), + + j0c32 => "02:-:-:-:1 \@P5 FADD TI00, i00, -i20;\n" . + "--:-:-:-:1 \@P5 FADD TI01, i01, -i21;\n" . + "--:-:-:-:1 \@P5 FADD TI02, i02, -i22;\n" . + "--:-:-:-:1 \@P5 FADD TI03, i03, -i23;\n", + + j0c35 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n", + j0c37 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n", + j0c39 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype i02, [track + ${dsize}x<0*$W*$N + 2*$N>];\n", + j0c41 => "--:-:-:-:1 \@P3 LDG.E.CI.$dtype i03, [track + ${dsize}x<0*$W*$N + 3*$N>];\n", + j0c43 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i00, RZ;\n", + j0c45 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i01, RZ;\n", + j0c47 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i02, RZ;\n", + j0c49 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i03, RZ;\n" . + "--:-:-:-:1 R2P PR, preds, 0x0f;\n", + + j0c50 => "--:-:-:-:1 \@P6 SHF.L.U64 preds, preds, 8, preds;\n", + + j0c55 => "04:-:-:-:1 \@P5 FADD TI30, i10, -i30;\n" . 
+ "--:-:-:-:1 \@P5 FADD TI31, i11, -i31;\n" . + "--:-:-:-:1 \@P5 FADD TI32, i12, -i32;\n" . + "--:-:-:-:1 \@P5 FADD TI33, i13, -i33;\n", + + j0c57 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype i30, [track + ${dsize}x<3*$W*$N + 0*$N>];\n", + j0c59 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype i31, [track + ${dsize}x<3*$W*$N + 1*$N>];\n", + j0c61 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype i32, [track + ${dsize}x<3*$W*$N + 2*$N>];\n", + j0c63 => "--:-:-:-:1 \@P3 LDG.E.CI.$dtype i33, [track + ${dsize}x<3*$W*$N + 3*$N>];\n", + j1c1 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i30, RZ;\n", + j1c3 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i31, RZ;\n", + j1c5 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i32, RZ;\n", + j1c7 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i33, RZ;\n" . + "--:-:-:-:1 R2P PR, preds, 0x0f;\n" . + "--:-:-:-:1 \@P5 FADD I00, TI00, -TI02;\n" . + "--:-:-:-:1 \@P5 FADD I03, TI01, -TI03;\n" . + "--:-:-:-:1 \@P5 FADD I30, TI30, -TI32;\n" . + "--:-:-:-:1 \@P5 FADD I33, TI31, -TI33;\n" . + "--:-:-:-:1 \@P6 SHF.R.U64 preds, preds, 4, preds;\n", + + j1c9 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*4 + 0)>], I00;\n", + j1c11 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*4 + 3)>], I03;\n", + j1c13 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*4 + 0)>], I30;\n", + j1c15 => "--:3:-:-:1 \@P5 STS [writeS + 4x<32*(3*4 + 3)>], I33;\n", + + + j1c29 => "04:-:-:-:1 \@P5 FADD TI10, i10, i20;\n" . + "--:-:-:-:1 \@P5 FADD TI20, -i10, i20;\n" . + "--:-:-:-:1 \@P5 FADD TI11, i11, i21;\n" . + "--:-:-:-:1 \@P5 FADD TI21, -i11, i21;\n" . + "--:-:-:-:1 \@P5 FADD TI12, i12, i22;\n" . + "--:-:-:-:1 \@P5 FADD TI22, -i12, i22;\n" . + "--:-:-:-:1 \@P5 FADD TI13, i13, i23;\n" . 
+ "--:-:-:-:1 \@P5 FADD TI23, -i13, i23;\n", + + j1c30 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n", + j1c32 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n", + j1c34 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype i12, [track + ${dsize}x<1*$W*$N + 2*$N>];\n", + j1c36 => "--:-:-:-:1 \@P3 LDG.E.CI.$dtype i13, [track + ${dsize}x<1*$W*$N + 3*$N>];\n", + j1c38 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i10, RZ;\n", + j1c40 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i11, RZ;\n", + j1c42 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i12, RZ;\n", + j1c44 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i13, RZ;\n" . + "--:-:-:-:1 R2P PR, preds, 0x0f;\n" . + "--:-:-:-:1 \@P5 FADD I10, TI10, -TI12;\n" . + "--:-:-:-:1 \@P5 FADD I20, TI20, -TI22;\n" . + "--:-:-:-:1 \@P5 FADD I13, TI11, -TI13;\n" . + "--:-:-:-:1 \@P5 FADD I23, TI21, -TI23;\n" . + "--:-:-:-:1 \@P6 SHF.L.U64 preds, preds, 8, preds;\n", + + j1c46 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*4 + 0)>], I10;\n", + j1c48 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*4 + 0)>], I20;\n", + j1c50 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*4 + 3)>], I13;\n", + j1c52 => "--:3:-:-:1 \@P5 STS [writeS + 4x<32*(2*4 + 3)>], I23;\n", + + + j2c8 => "04:-:-:-:1 \@P5 FADD I21, TI21, TI22;\n" . 
+ "--:-:-:-:1 \@P5 FADD I22, -TI21, TI22;\n", + + j2c11 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*4 + 1)>], I21;\n", + j2c13 => "--:3:-:-:1 \@P5 STS [writeS + 4x<32*(2*4 + 2)>], I22;\n", + + j2c15 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype i20, [track + ${dsize}x<2*$W*$N + 0*$N>];\n", + j2c17 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype i21, [track + ${dsize}x<2*$W*$N + 1*$N>];\n", + j2c19 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype i22, [track + ${dsize}x<2*$W*$N + 2*$N>];\n", + j2c21 => "--:6:2:-:1 \@P3 LDG.E.CI.$dtype i23, [track + ${dsize}x<2*$W*$N + 3*$N>];\n", + j2c23 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i20, RZ;\n", + j2c25 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i21, RZ;\n", + j2c27 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i22, RZ;\n", + j2c29 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i23, RZ;\n", + + j2c30 => "04:-:-:-:1 \@P5 FADD I01, TI01, TI02;\n" . + "--:-:-:-:1 \@P5 FADD I02, -TI01, TI02;\n" . + "--:-:-:-:1 \@P5 FADD I11, TI11, TI12;\n" . + "--:-:-:-:1 \@P5 FADD I12, -TI11, TI12;\n" . + "--:-:-:-:1 \@P5 FADD I31, TI31, TI32;\n" . + "--:-:-:-:1 \@P5 FADD I32, -TI31, TI32;\n", + + j2c31 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*4 + 1)>], I01;\n", + j2c33 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*4 + 2)>], I02;\n", + j2c35 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*4 + 1)>], I11;\n", + j2c37 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*4 + 2)>], I12;\n", + j2c39 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*4 + 1)>], I31;\n", + j2c41 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*4 + 2)>], I32;\n", + + j2c62 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P5 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" . + "--:-:-:-:1 \@P5 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" . + "--:-:-:-:1 \@P5 LOP.XOR writeS, writeS, 4x<512*4*2>;\n", + + j3c57 => "20:-:-:-:1 \@P6 IADD track0.CC, track0, param_4HWNp;\n", + j3c62 => "--:-:-:-:1 \@P6 IADD.X track1, track1, RZ;\n", + + j3c63 => "--:-:-:Y:5 \@P5 BRA.U IMAGE_LOOP;\n" . 
+ "--:-:-:Y:5 BRA.U END_LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 3) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 4; + my $rsPred = $j == 3 ? '@P5' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +FILTER_LOOP: +--:-:-:-:1 ISETP.GT.AND P0, PT, C, RZ, PT; +[+ + our ($dtype, $convert_in, $FX, $vsize, $dsize, $K); + my %insert = ( + + j0c1 => "--:-:-:-:1 ISETP.GT.AND P1, PT, C, 4, PT;\n" . + "--:-:-:-:1 IADD C, C, -4;\n", + + $FX ? ( + $convert_in ? 
( + j1c8 => "02:-:-:-:1 \@P0 $convert_in F03, F01.H1;\n", + j1c12 => "--:-:-:-:1 \@P0 $convert_in F02, F01.H0;\n", + j1c16 => "--:-:-:-:1 \@P0 $convert_in F01, F00.H1;\n", + j1c20 => "--:-:2:-:1 \@P0 $convert_in F00, F00.H0;\n", + + j1c26 => "04:-:-:-:1 \@P0 $convert_in F13, F11.H1;\n", + j1c30 => "--:-:-:-:1 \@P0 $convert_in F12, F11.H0;\n", + j1c34 => "--:-:-:-:1 \@P0 $convert_in F11, F10.H1;\n", + j1c38 => "--:-:3:-:1 \@P0 $convert_in F10, F10.H0;\n", + + j2c8 => "08:-:-:-:1 \@P0 $convert_in F23, F21.H1;\n", + j2c12 => "--:-:-:-:1 \@P0 $convert_in F22, F21.H0;\n", + j2c16 => "--:-:-:-:1 \@P0 $convert_in F21, F20.H1;\n", + j2c20 => "--:-:4:-:1 \@P0 $convert_in F20, F20.H0;\n", + + j2c26 => "10:-:-:-:1 \@P0 $convert_in F33, F31.H1;\n", + j2c30 => "--:-:-:-:1 \@P0 $convert_in F32, F31.H0;\n", + j2c34 => "--:-:-:-:1 \@P0 $convert_in F31, F30.H1;\n", + j2c38 => "--:6:5:-:1 \@P0 $convert_in F30, F30.H0;\n", + ) : (), + + j1c22 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<512*4 + 00*4>], F0;\n", + j1c24 => "02:-:2:-:1 \@P1 LDG.E.CG.$vsize F0, [track0 + 4x<00 * $dsize>];\n", + + j1c40 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<512*4 + 32*4>], F1;\n", + j1c42 => "04:-:3:-:1 \@P1 LDG.E.CG.$vsize F1, [track0 + 4x<32 * $dsize>];\n", + + j2c22 => "08:4:-:-:1 \@P0 STS.128 [writeS + 4x<512*4 + 64*4>], F2;\n", + j2c24 => "08:-:4:-:1 \@P1 LDG.E.CG.$vsize F2, [track0 + 4x<64 * $dsize>];\n", + + j2c40 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<512*4 + 96*4>], F3;\n", + j2c42 => "10:6:5:-:1 \@P1 LDG.E.CG.$vsize F3, [track0 + 4x<96 * $dsize>];\n", + + j3c57 => "20:-:-:-:1 \@P1 IADD track0.CC, track0, 4x<32*16 * $dsize>;\n", + j3c62 => "--:-:-:-:1 \@P1 IADD.X track1, track1, RZ;\n", + + ) : ( + $convert_in ? 
( + j0c5 => "02:-:-:-:1 $convert_in f00, f00;\n", + j0c7 => "--:-:-:-:1 $convert_in f01, f01;\n", + j0c9 => "--:-:-:-:1 $convert_in f02, f02;\n", + + j0c11 => "--:-:-:-:1 $convert_in f20, f20;\n", + j0c13 => "--:-:-:-:1 $convert_in f21, f21;\n", + j0c15 => "--:-:2:-:1 $convert_in f22, f22;\n", + + j0c17 => "--:-:-:-:1 $convert_in f10, f10;\n", + j0c19 => "--:-:-:-:1 $convert_in f11, f11;\n", + j0c21 => "--:-:4:-:1 $convert_in f12, f12;\n", + ) : (), + + j0c33 => "02:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 0)>], F00;\n", + j0c35 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 3)>], F03;\n", + j0c37 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 0)>], F30;\n", + j0c39 => "--:3:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 3)>], F33;\n", + + j0c40 => "--:-:-:-:1 \@P0 FADD tb0, TF00, TF02;\n" . + "--:-:-:-:1 \@P0 FADD tb3, TF30, TF32;\n" . + "--:-:-:-:1 \@P0 FADD ta0, f00, f20;\n" . + "--:-:-:-:1 \@P0 FADD ta1, f01, f21;\n" . + "--:-:-:-:1 \@P0 FADD ta2, f02, f22;\n", + + j0c41 => "--:-:-:-:1 \@P0 FMUL tb0, tb0, 0.5;\n" . + "--:-:-:-:1 \@P0 FMUL tb3, tb3, 0.5;\n" . + "--:-:-:-:1 \@P0 FMUL ta0, ta0, 0.5;\n" . + "--:-:-:-:1 \@P0 FMUL ta1, ta1, 0.5;\n" . + "--:-:-:-:1 \@P0 FMUL ta2, ta2, 0.5;\n", + + j0c42 => "--:-:-:-:1 \@P0 FFMA F01, TF01, 0.5, tb0;\n" . + "--:-:-:-:1 \@P0 FFMA F02, TF01, -0.5, tb0;\n" . + "--:-:-:-:1 \@P0 FFMA F31, TF31, 0.5, tb3;\n" . 
+ "--:-:-:-:1 \@P0 FFMA F32, TF31, -0.5, tb3;\n", + + j0c45 => "04:-:-:-:1 \@P1 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n", + j0c47 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n", + j0c49 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n", + + j0c51 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n", + j0c53 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n", + j0c55 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];\n", + + j1c8 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 1)>], F01;\n", + j1c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 2)>], F02;\n", + j1c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 1)>], F31;\n", + j1c14 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 2)>], F32;\n", + + j1c15 => "08:-:-:-:1 \@P0 FFMA TF10, f10, 0.5, ta0;\n" . + "--:-:-:-:1 \@P0 FFMA TF20, f10, -0.5, ta0;\n" . + "--:-:-:-:1 \@P0 FFMA TF11, f11, 0.5, ta1;\n" . + "--:-:-:-:1 \@P0 FFMA TF21, f11, -0.5, ta1;\n" . + "--:-:-:-:1 \@P0 FFMA TF12, f12, 0.5, ta2;\n" . + "--:-:-:-:1 \@P0 FFMA TF22, f12, -0.5, ta2;\n", + + j1c16 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n", + j1c18 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n", + j1c20 => "--:6:2:-:1 \@P1 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n", + + j1c22 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 0)>], F10;\n", + j1c24 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 0)>], F20;\n", + j1c26 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 3)>], F13;\n", + j1c28 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 3)>], F23;\n", + + j1c29 => "--:-:-:-:1 \@P0 FADD tb1, TF10, TF12;\n" . + "--:-:-:-:1 \@P0 FADD tb2, TF20, TF22;\n", + + j1c34 => "--:-:-:-:1 \@P0 FMUL tb1, tb1, 0.5;\n" . 
+ "--:-:-:-:1 \@P0 FMUL tb2, tb2, 0.5;\n", + + j1c39 => "--:-:-:-:1 \@P0 FFMA F11, TF11, 0.5, tb1;\n" . + "--:-:-:-:1 \@P0 FFMA F12, TF11, -0.5, tb1;\n" . + "--:-:-:-:1 \@P0 FFMA F21, TF21, 0.5, tb2;\n" . + "--:-:-:-:1 \@P0 FFMA F22, TF21, -0.5, tb2;\n", + + j2c8 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 1)>], F11;\n", + j2c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 2)>], F12;\n", + j2c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 1)>], F21;\n", + j2c14 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 2)>], F22;\n", + + + j3c57 => "20:-:-:-:1 \@P1 IADD track0.CC, track0, param_4RSKp;\n", + j3c62 => "--:-:-:-:1 \@P1 IADD.X track1, track1, RZ;\n", + ), + + j2c62 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<512*4*2>;\n", + + j3c63 => "--:-:-:Y:5 \@P0 BRA.U FILTER_LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 3) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 4; + my $rsPred = $j == 3 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 
63) + { + # (x, y) picks the accumulator cx<x>y<y> written by this FFMA. + my ($x,$y) = @{$cOrder[$c]}; + + # Text emitted directly after this FFMA; may hold several instructions. + my $ins = $insert{"j${j}c$c"} || ''; + + # Only the FIRST inserted instruction decides the FFMA stall count: 0 when it is one of the ops listed below, 1 otherwise. + # Anchoring to the first line keeps later lines of a multi-instruction insert from zeroing the stall (same rule as the IMAGE_LOOP generator). + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + # The yield flag is set only on FFMA 32 of each group, and only when $stall is 1. + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + # The first FFMA of each group carries wait mask 01; all others carry none. + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +END_LOOP: +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_N, SR_CTAID.Z; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; + + +01:-:-:-:1 LOP.AND tid_31, tid, 31; + +--:-:-:-:1 BFE idx_n, idx_nkpq, 0x041c; +--:-:-:-:1 BFE idx_k, idx_nkpq, 0x0418; +--:-:-:-:1 BFE idx_P, idx_nkpq, 0x0c0c; +--:-:-:-:1 BFE idx_Q, idx_nkpq, 0x0c00; + +02:-:-:-:1 XMAD idx_N, idx_N, param_n, idx_n; +04:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +[+ + our $bsum; return $bsum ? q{ +--:-:-:-:1 XMAD bsum_offset, idx_Q, param_gridN, idx_N; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_P, param_gridQN, bsum_offset; + } : ''; ++] + +// x = grid_x << shiftX +// y = grid_y << shiftY +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; + +// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp +--:-:-:-:1 BFE.U32 superP, tid, param_superP; +--:-:-:-:1 BFE.U32 superQ, tid, param_superQ; +--:-:-:-:1 ISCADD p, superP, p, 1; +--:-:-:-:1 ISCADD q, superQ, q, 1; + + +--:-:-:-:1 LOP.AND superN, tid, param_superN; +--:-:-:-:1 SHL n, idx_N, param_shiftN; +--:-:-:-:1 IADD n, n, superN; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV32I one, 1.0; + +// readFs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid_1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 LOP.OR readFs, readFs, tid_1; +//--:-:-:-:1 SHL readFs, readFs, 3; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2) +--:-:-:-:1 LOP.AND tid_16, tid, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; +--:-:-:-:1 BFE.U32
readIs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid_16; +--:-:-:-:1 ISCADD readIs, readFs, readIs, 2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// writeCs = (readFs << 12) + readIs +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 12; + +// readCs = ((tid >> 5) * 512 + tid_31 + (tid >> 6) * 16) * 4 +--:-:-:-:1 SHR.U32 tid32, tid, 5; +--:-:-:-:1 SHR.U32 tid64, tid, 6; +--:-:-:-:1 ISCADD readCs, tid32, tid_31, 9; +--:-:-:-:1 ISCADD readCs, tid64, readCs, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// k = idx_K*32 + (tid32 << 1) +--:-:-:-:1 SHL tid32, tid32, 1; +--:-:-:-:1 ISCADD k, idx_K, tid32, 5; + +// Out00 = k*PQN + p*QN + q*N + n +// Out01 = Out00 + N +// Out10 = Out00 + QN +// Out11 = Out01 + QN +--:-:-:-:1 XMAD out_offset, q, 1x<$N>, n; +--:-:-:-:1 XMAD.LO2C out_offset, p, param_QN, out_offset; +--:-:-:-:1 XMAD.LO2C out_offset, k, param_PQN, out_offset; + + +// PQN15 = (PQN << 4) - PQN = 15 * PQN +--:-:-:-:1 MOV PQN15, param_PQN; +--:-:-:-:1 SHL PQN15, PQN15, 4; +--:-:-:-:1 IADD PQN15, PQN15, -param_PQN; + +--:-:-:-:1 IADD q2, q, 1; +--:-:-:-:1 IADD p2, p, 1; + + +--:-:-:-:1 ISETP.EQ.AND P6, PT, RZ, param_flags, PT; // !
no-op +--:-:-:-:1 ISETP.LT.AND P6, PT, n, 1x<$N>, P6; // P6 = P6 && n < N +--:-:-:-:1 ISETP.LT.AND P2, PT, p, param_P, PT; // p0 < P +--:-:-:-:1 ISETP.LT.AND P3, PT, q, 1x<$Q>, PT; // q0 < Q +--:-:-:-:1 ISETP.LT.AND P4, PT, p2, param_P, PT; // p1 < P +--:-:-:-:1 ISETP.LT.AND P5, PT, q2, 1x<$Q>, PT; // q1 < Q + +--:-:-:-:1 PSETP.AND.AND P0, PT, P2, P3, P6; // p0 && q0 && P6 +--:-:-:-:1 PSETP.AND.AND P1, PT, P2, P5, P6; // p0 && q1 && P6 +--:-:-:-:1 PSETP.AND.AND P2, PT, P4, P3, P6; // p1 && q0 && P6 +--:-:-:-:1 PSETP.AND.AND P3, PT, P4, P5, P6; // p1 && q1 && P6 +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +--:-:-:-:1 ISETP.EQ.AND P6, PT, tid_31, RZ, PT; // tid_31 == 0 + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, cx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, cx7y2, alpha; + +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 1; +--:-:-:-:0 IADD out_offset, out_offset, param_PQN; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y1, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y1, alpha; +--:-:-:-:1
FMUL shuffle_x4y0, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 15; +--:-:-:-:0 IADD out_offset, out_offset, PQN15; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y4, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y6, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 1; +--:-:-:-:0 IADD 
out_offset, out_offset, param_PQN; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + + + +11:-:-:-:1 ISETP.LT.AND P4, PT, k, 1x<$K>, PT; // k < K +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; +[+ + our ($beta, $brelu, $bprelu, $dsize, $dshift, $dtype, $Q, $N); + return $beta || $brelu || $bprelu ? qq{ +--:-:-:-:1 LEA Out0.CC, out_offset, param_X[0], $dshift; +--:-:-:-:1 LEA.HI.X Out1, out_offset, param_X[1], RZ, $dshift; + +--:-:-:-:1 \@!P0 MOV b00, RZ; +--:-:-:-:1 \@!P1 MOV b01, RZ; +--:-:-:-:1 \@!P2 MOV b10, RZ; +--:-:-:-:1 \@!P3 MOV b11, RZ; + +--:-:-:-:1 \@P0 LDG.E.CI.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>]; +--:-:5:-:1 \@P1 LDG.E.CI.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>]; +--:-:6:-:1 \@P3 LDG.E.CI.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>]; + + } : ''; ++] +[+ + our $bias; return $bias ? 
q{ +// sum = S + k +20:-:-:-:1 LEA Sum0.CC, k, param_S[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, k, param_S[1], RZ, 2; + +--:-:-:-:1 @!P4 MOV b00, RZ; +--:-:5:-:1 @P4 LDG.E.CI b00, [Sum]; + } : ''; ++] + + +--:-:-:-:1 LDS m00, [readCs + 4x< 0*32>]; +--:-:-:-:1 LDS m01, [readCs + 4x< 1*32>]; +--:-:-:-:1 LDS m02, [readCs + 4x< 2*32>]; +--:-:1:Y:1 LDS m03, [readCs + 4x< 3*32>]; + +--:-:-:-:1 LDS m10, [readCs + 4x< 4*32>]; +--:-:-:-:1 LDS m11, [readCs + 4x< 5*32>]; +--:-:-:-:1 LDS m12, [readCs + 4x< 6*32>]; +--:-:2:Y:1 LDS m13, [readCs + 4x< 7*32>]; + +--:-:-:-:1 LDS m20, [readCs + 4x< 8*32>]; +--:-:-:-:1 LDS m21, [readCs + 4x< 9*32>]; +--:-:-:-:1 LDS m22, [readCs + 4x<10*32>]; +--:-:3:Y:1 LDS m23, [readCs + 4x<11*32>]; + +--:-:-:-:1 LDS m30, [readCs + 4x<12*32>]; +--:-:-:-:1 LDS m31, [readCs + 4x<13*32>]; +--:-:-:-:1 LDS m32, [readCs + 4x<14*32>]; +--:-:4:Y:1 LDS m33, [readCs + 4x<15*32>]; + + +// t00 = m00+m01+m02; +// t01 = m01-m02-m03; +01:-:-:-:1 FADD t00, m00, m01; +--:-:-:-:1 FADD t00, t00, m02; +--:-:-:-:1 FADD t01, m01, -m02; +--:-:-:-:1 FADD t01, t01, -m03; +// t10 = m10+m11+m12; +// t11 = m11-m12-m13; +02:-:-:-:1 FADD t10, m10, m11; +--:-:-:-:1 FADD t10, t10, m12; +--:-:-:-:1 FADD t11, m11, -m12; +--:-:-:-:1 FADD t11, t11, -m13; +// t20 = m20+m21+m22; +// t21 = m21-m22-m23; +04:-:-:-:1 FADD t20, m20, m21; +--:-:-:-:1 FADD t20, t20, m22; +--:-:-:-:1 FADD t21, m21, -m22; +--:-:-:-:1 FADD t21, t21, -m23; +// t30 = m30+m31+m32; +// t31 = m31-m32-m33; +08:-:-:-:1 FADD t30, m30, m31; +--:-:-:-:1 FADD t30, t30, m32; +--:-:-:-:1 FADD t31, m31, -m32; +--:-:-:-:1 FADD t31, t31, -m33; +// y00 = t00+t10+t20; +// y01 = t01+t11+t21; +--:-:-:-:1 FADD s00, t00, t10; +--:-:-:-:1 FADD s00, s00, t20; +--:-:-:-:1 FADD s01, t01, t11; +--:-:-:-:1 FADD s01, s01, t21; +// y10 = t10-t20-t30; +// y11 = t11-t21-t31; +--:-:-:-:1 FADD s10, t10, -t20; +--:-:-:-:1 FADD s10, s10, -t30; +--:-:-:-:1 FADD s11, t11, -t21; +--:-:-:-:3 FADD s11, s11, -t31; + +[+ + our $bias; return $bias ? 
q{ +10:-:-:-:1 @P0 FADD s00, s00, b00; +--:-:-:-:1 @P1 FADD s01, s01, b00; +--:-:-:-:1 @P2 FADD s10, s10, b00; +--:-:-:-:1 @P3 FADD s11, s11, b00; + } : ''; ++] +[+ + our $relu; return $relu ? q{ +// maximum(x, 0) + slope * minimum(0, x) +--:-:-:-:1 @P0 FMNMX s00, s00, RZ, !PT; +--:-:-:-:1 @P1 FMNMX s01, s01, RZ, !PT; +--:-:-:-:1 @P2 FMNMX s10, s10, RZ, !PT; +--:-:-:-:1 @P3 FMNMX s11, s11, RZ, !PT; + } : ''; ++] +[+ + our $prelu; return $prelu ? q{ +// maximum(x, 0) + slope * minimum(0, x) +--:-:-:-:1 @P0 FMNMX b00, s00, RZ, !PT; +--:-:-:-:1 @P1 FMNMX b01, s01, RZ, !PT; +--:-:-:-:1 @P2 FMNMX b10, s10, RZ, !PT; +--:-:-:-:1 @P3 FMNMX b11, s11, RZ, !PT; + +--:-:-:-:1 @P0 FMNMX x00, s00, RZ, PT; +--:-:-:-:1 @P1 FMNMX x01, s01, RZ, PT; +--:-:-:-:1 @P2 FMNMX x10, s10, RZ, PT; +--:-:-:-:1 @P3 FMNMX x11, s11, RZ, PT; + +--:-:-:-:1 @P0 FFMA s00, x00, param_beta, b00; +--:-:-:-:1 @P1 FFMA s01, x01, param_beta, b01; +--:-:-:-:1 @P2 FFMA s10, x10, param_beta, b10; +--:-:-:-:1 @P3 FFMA s11, x11, param_beta, b11; + } : ''; ++] + + + +[+ + our ($beta, $brelu, $bprelu, $convert_in); + return $convert_in && ($beta || $brelu || $bprelu) ? qq{ +10:-:1:-:1 \@P0 $convert_in b00, b00; +--:-:2:-:1 \@P1 $convert_in b01, b01; +20:-:3:-:1 \@P2 $convert_in b10, b10; +--:-:4:-:1 \@P3 $convert_in b11, b11; + } : ''; ++] +[+ + our $beta; return $beta ? q{ +11:-:-:-:1 @P0 FFMA s00, b00, param_beta, s00; +02:-:-:-:1 @P1 FFMA s01, b01, param_beta, s01; +24:-:-:-:1 @P2 FFMA s10, b10, param_beta, s10; +08:-:-:-:1 @P3 FFMA s11, b11, param_beta, s11; + } : ''; ++] +[+ + our $brelu; return $brelu ? 
q{ +//delta *= x > 0 +11:-:-:-:1 FSETP.GT.AND P0, PT, b00, RZ, PT; +02:-:-:-:1 FSETP.GT.AND P1, PT, b01, RZ, PT; +24:-:-:-:1 FSETP.GT.AND P2, PT, b10, RZ, PT; +08:-:-:-:1 FSETP.GT.AND P3, PT, b11, RZ, PT; +--:-:-:-:1 @!P0 MOV s00, RZ; +--:-:-:-:1 @!P1 MOV s01, RZ; +--:-:-:-:1 @!P2 MOV s10, RZ; +--:-:-:-:1 @!P3 MOV s11, RZ; +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:5 @!P4 R2P PR, RZ, 0x0f; + } : ''; ++] +[+ + our $bprelu; return $bprelu ? q{ +//delta *= ((x > 0) + slope * (x < 0)) +11:-:-:-:1 FSETP.GT.AND P0, PT, b00, RZ, PT; +02:-:-:-:1 FSETP.GT.AND P1, PT, b01, RZ, PT; +24:-:-:-:1 FSETP.GT.AND P2, PT, b10, RZ, PT; +08:-:-:-:1 FSETP.GT.AND P3, PT, b11, RZ, PT; +--:-:-:-:1 SEL x00, one, RZ, P0; +--:-:-:-:1 SEL x01, one, RZ, P1; +--:-:-:-:1 SEL x10, one, RZ, P2; +--:-:-:-:1 SEL x11, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b00, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b01, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b10, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b11, RZ, PT; +--:-:-:-:1 SEL b00, one, RZ, P0; +--:-:-:-:1 SEL b01, one, RZ, P1; +--:-:-:-:1 SEL b10, one, RZ, P2; +--:-:-:-:1 SEL b11, one, RZ, P3; +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; +--:-:-:-:1 FFMA b00, b00, param_beta, x00; +--:-:-:-:1 FFMA b01, b01, param_beta, x01; +--:-:-:-:1 FFMA b10, b10, param_beta, x10; +--:-:-:-:1 FFMA b11, b11, param_beta, x11; +--:-:-:-:1 FMUL s00, s00, b00; +--:-:-:-:1 FMUL s01, s01, b01; +--:-:-:-:1 FMUL s10, s10, b10; +--:-:-:-:1 FMUL s11, s11, b11; + } : ''; ++] +[+ + our $bsum; return $bsum ? q{ +--:-:-:-:1 MOV sum0, RZ; +--:-:-:-:1 @P0 FADD sum0, s00, sum0; +--:-:-:-:1 @P1 FADD sum0, s01, sum0; +--:-:-:-:1 @P2 FADD sum0, s10, sum0; +--:-:-:-:1 @P3 FADD sum0, s11, sum0; + } : ''; ++] + + +[+ + our $convert_out; + return $convert_out ? 
qq{ +--:-:1:-:1 $convert_out s00, s00; +--:-:2:-:1 $convert_out s01, s01; +--:-:3:-:1 $convert_out s10, s10; +--:-:4:-:1 $convert_out s11, s11; + } : ''; ++] + + + +--:-:-:-:1 LEA Out0.CC, out_offset, param_O[0], [+ dshift() +]; +--:-:-:-:1 LEA.HI.X Out1, out_offset, param_O[1], RZ, [+ dshift() +]; + +// k < K && R2P && output +01:-:-:-:1 @P0 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 0*$N>], s00; +02:-:-:-:1 @P1 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 1*$N>], s01; +04:-:-:-:1 @P2 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 0*$N>], s10; +08:1:-:-:1 @P3 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 1*$N>], s11; + + +[+ + our $bsum; + return $bsum ? q{ + +--:-:-:-:1 XMAD.LO2C b00, k, param_gridPQN, bsum_offset; + +--:-:-:-:1 LEA Sum0.CC, b00, param_S[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, b00, param_S[1], RZ, 2; + +--:-:-:-:1 PSETP.AND.AND P5, PT, P4, P6, PT; // k < K && tid31 == 0 + +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 1, 0x1f; +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 2, 0x1f; +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 4, 0x1f; +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 8, 0x1f; +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 16, 0x1f; +10:-:-:-:2 FADD sum0, sum1, sum0; + +--:5:-:-:1 @P5 STG.E.CG [Sum], sum0; + + } : ''; ++] + +--:-:-:-:5 RET; + + + +// T0 = np.empty((4,4)) +// T1 = np.empty((4,4)) +// +// for O, I in ((T0, I), (T1, T0.T)): +// +// O[0,:] = I[0,:] - I[2,:] +// O[1,:] = I[1,:] + I[2,:] +// O[2,:] = I[2,:] - I[1,:] +// O[3,:] = I[1,:] - I[3,:] +// +// Iw[:] = T1.T +// +// 0 = i00 +// 1 = i01 +// 2 = i02 +// 3 = i03 +// 4 = i30 +// 5 = i31 +// 6 = i32 +// 7 = i33 +// 8 = i13 +// 9 = i12 +// 10 = i11 +// 11 = i10 +// 12 = i23, TI23, I23 +// 13 = i22, TI22 +// 14 = i21, TI21 +// 15 = i20, TI20, I20 +// 16 = TI00, I00, TI10, I10, I21, I01 +// 17 = TI01, I11 +// 18 = TI02, I12 +// 19 = TI03, I03, 
TI11, I31 +// 20 = TI30, I30, TI12, I32 +// 21 = TI31 +// 22 = TI32 +// 23 = TI33, I33, TI13, I13, I22, I02 +// +// +// TI00 = i00 - i20 +// TI01 = i01 - i21 +// TI02 = i02 - i22 +// TI03 = i03 - i23 +// # load 0 +// +// TI30 = i10 - i30 +// TI31 = i11 - i31 +// TI32 = i12 - i32 +// TI33 = i13 - i33 +// # load 3 +// +// I00 = TI00 - TI02 +// I03 = TI01 - TI03 +// I30 = TI30 - TI32 +// I33 = TI31 - TI33 +// # store 0 +// +// # wait 0 +// TI10 = i10 + i20 +// TI11 = i11 + i21 +// TI12 = i12 + i22 +// TI13 = i13 + i23 +// +// TI20 = i20 - i10 +// TI21 = i21 - i11 +// TI22 = i22 - i12 +// TI23 = i23 - i13 +// +// #load 1 +// +// I10 = TI10 - TI12 +// I20 = TI20 - TI22 +// I13 = TI11 - TI13 +// I23 = TI21 - TI23 +// # store 1 +// +// # wait 1 +// I21 = TI21 + TI22 +// I22 = TI22 - TI21 +// # store 2 +// +// # load 2 +// +// # wait 2 +// I01 = TI01 + TI02 +// I02 = TI02 - TI01 +// I11 = TI11 + TI12 +// I12 = TI12 - TI11 +// I31 = TI31 + TI32 +// I32 = TI32 - TI31 +// #store 3 + + + +// T0 = np.empty((4,3)) +// T1 = np.empty((4,4)) +// +// for O, I in ((T0, F), (T1, T0.T)): +// +// t0 = (I[0,:] + I[2,:])*0.5 +// +// O[0,:] = I[0,:] +// O[1,:] = t0 + I[1,:]*0.5 +// O[2,:] = t0 - I[1,:]*0.5 +// O[3,:] = I[2,:] +// +// Fw[:] = T1.T +// +// 0 = f00, TF00, F00 +// 1 = f01, TF01 +// 2 = f02, TF02, F03 +// 3 = f10 +// 4 = f11 +// 5 = f12 +// 6 = f20, TF30, F30 +// 7 = f21, TF31 +// 8 = f22, TF32, F33 +// 9 = tb3, F32 +// 10 = tb0, F02 +// 11 = ta2, TF22, F23 +// 12 = ta0, TF20, F20 +// 13 = ta1, TF21 +// 14 = F01 +// 15 = F31 +// 16 = TF10, F10 +// 17 = TF11 +// 18 = TF12, F13 +// 19 = tb1, F12 +// 20 = tb2, F22 +// 21 = F11 +// 22 = F21 +// 23 = +// +// +// TF00 = f00 +// TF01 = f01 +// TF02 = f02 +// TF30 = f20 +// TF31 = f21 +// TF32 = f22 +// +// F00 = TF00 +// F03 = TF02 +// F30 = TF30 +// F33 = TF32 +// +// # store 0 +// +// tb0 = TF00 + TF02 +// tb3 = TF30 + TF32 +// ta0 = f00 + f20 +// ta1 = f01 + f21 +// ta2 = f02 + f22 +// +// tb0 = tb0 * 0.5 +// tb3 = tb3 * 0.5 +// 
ta0 = ta0 * 0.5 +// ta1 = ta1 * 0.5 +// ta2 = ta2 * 0.5 +// +// F01 = tb0 + TF01*0.5 +// F02 = tb0 - TF01*0.5 +// F31 = tb3 + TF31*0.5 +// F32 = tb3 - TF31*0.5 +// +// # wait 0 +// # load 0, 2 +// # store 1 +// +// TF10 = ta0 + f10*0.5 +// TF20 = ta0 - f10*0.5 +// TF11 = ta1 + f11*0.5 +// TF21 = ta1 - f11*0.5 +// TF12 = ta2 + f12*0.5 +// TF22 = ta2 - f12*0.5 +// +// # load 1 +// +// F10 = TF10 +// F20 = TF20 +// F13 = TF12 +// F23 = TF22 +// +// # store 2 +// +// tb1 = TF10 + TF12 +// tb2 = TF20 + TF22 +// tb1 = tb1 * 0.5 +// tb2 = tb2 * 0.5 +// +// F11 = tb1 + TF11*0.5 +// F12 = tb1 - TF11*0.5 +// F21 = tb2 + TF21*0.5 +// F22 = tb2 - TF21*0.5 +// +// # store 3// \ No newline at end of file diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_2x2_5x5_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_2x2_5x5_32x32.sass new file mode 100644 index 0000000..0fcb767 --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_winograd_2x2_5x5_32x32.sass @@ -0,0 +1,1589 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $type; +our $dtype = $type eq 'h' ? 'U16' : '32'; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +our $dshift = $type eq 'h' ? '1' : '2'; +our $dsize = $type eq 'h' ? '2' : '4'; +our $vsize = $type eq 'h' ? 
'64' : '128'; +sub dtype { return $dtype; } +sub dsize { return $dsize; } +sub dshift { return $dshift; } +sub vsize { return $vsize; } +-] + + + + addr_zero : 4x<32*36*2*4 + 64 + 0> + addr_idx_Y : 4x<32*36*2*4 + 64 + 4> + addr_idx_X : 4x<32*36*2*4 + 64 + 5> + addr_idx_K : 4x<32*36*2*4 + 64 + 6> + + param_O[0] : c[0x0][0x140] + param_O[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_F[0] : c[0x0][0x150] + param_F[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_flags : c[0x0][0x15c] + param_C : c[0x0][0x160] + param_K : c[0x0][0x164] + param_N : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_HWN : c[0x0][0x174] + param_WN : c[0x0][0x178] + param_Y2 : c[0x0][0x17c] + param_GX : c[0x0][0x180] + param_Xk : c[0x0][0x184] + param_k : c[0x0][0x188] + param_magic_Xk : c[0x0][0x18c] + param_shift_Xk : c[0x0][0x190] + param_magic_k : c[0x0][0x194] + param_shift_k : c[0x0][0x198] + param_P : c[0x0][0x19c] + param_Q : c[0x0][0x1a0] + param_QN : c[0x0][0x1a4] + param_PQN : c[0x0][0x1a8] + param_PQNp : c[0x0][0x1ac] + param_PQN15p : c[0x0][0x1b0] + param_shiftY : c[0x0][0x1b4] + param_shiftX : c[0x0][0x1b8] + param_shiftN : c[0x0][0x1bc] + param_superY : c[0x0][0x1c0] + param_superX : c[0x0][0x1c4] + param_superN : c[0x0][0x1c8] + param_SuperY : c[0x0][0x1cc] + param_SuperX : c[0x0][0x1d0] + param_SuperN : c[0x0][0x1d4] + param_pad_x : c[0x0][0x1d8] + param_pad_y : c[0x0][0x1dc] + param_HWN2p : c[0x0][0x1e0] + param_C_1152 : c[0x0][0x1e4] + + + + + 0-63 : czero<00-63> + + // Image Transform + 52 = i00, TI00, I00 + 53 = i10, TI50, I50 + 54 = i01, TI01, I05 + 55 = i11, TI51, I55 + 56 = TI10, I10 + 57 = TI20, I20 + 58 = TI30, I30 + 59 = TI40, I40 + 60 = TI41, I45 + 61 = TI31, I35 + 62 = TI21, I25 + 63 = TI11, I15 + 64-67 : I0<1-4> + 68-71 : I5<1-4> + 72-75 : I1<1-4> + 76-79 : I2<1-4> + 80-83 : I3<1-4> + 84-87 : I4<1-4> + + // Filter Transform + 52-87 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>, F4<0-3>, F5<0-3>, 
F6<0-3>, F7<0-3>, F8<0-3> + + // Load Loop Registers + 3, 2,11,10 : clx<0-3>y0 + 7, 6,15,14 : clx<0-3>y1 + 1, 0, 9, 8 : clx<0-3>y2 + 5, 4,13,12 : clx<0-3>y3 + 19,18,27,26 : clx<0-3>y4 + 23,22,31,30 : clx<0-3>y5 + 17,16,25,24 : clx<0-3>y6 + 21,20,29,28 : clx<0-3>y7 + + 32-43 : jl0Ix<0-3>, jl0Fy<0-7> + 44-51 : jl1Ix<0-3>, jl1Fy<4-7> + 36-39 : jl1Fy<0-3> + + 32-51 ~ partialC, c, idx_K, idx_Y, idx_X, idx_N, tid31, gx, gy, offset, nn, x1, x2, y1, mask_x + 52-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, idx_Y2, idx_X2, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, super_x, super_y + 87 = tid + + // Compute Loop Registers + 3, 2,11,10,19,18,27,26 : ccx<0-7>y0 + 7, 6,15,14,23,22,31,30 : ccx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2 + 5, 4,13,12,21,20,29,28 : ccx<0-7>y3 + 35,34,43,42,51,50,59,58 : ccx<0-7>y4 + 39,38,47,46,55,54,63,62 : ccx<0-7>y5 + 33,32,41,40,49,48,57,56 : ccx<0-7>y6 + 37,36,45,44,53,52,61,60 : ccx<0-7>y7 + + 64-79 : jc0Ix<0-7>, jc0Fy<0-7> + 80-91 : jc1Ix<4-7>, jc1Fy<0-7> + 64-67 : jc1Ix<0-3> + + 64-86 ~ tid16, tid_1, tid128 + + // Shared Registers + 88-89 : track<0-1> + 92-95 ~ C, swapBuf, readFs, readIs + 90-91 ~ writeS, preds + + // Load Loop Finish + 32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1 + 48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16 + + // Compute Loop Finish + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + 64-87 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, four, z<1-5>, mask_q, offsetO, sign + 90-95 ~ writeCs, readCs, k, pred30, pred36, tid31_4 + 88-89 : Out<0-1> + + 3, 2,11,10,19,18 : m<0-5>0 + 27, 1,26, 0, 9, 8 : m<0-5>1 + 16,17,24,25,64,65 : m<0-5>2 + 66,67,68,69,70,71 : m<0-5>3 + 72,73,74,75,76,77 : m<0-5>4 + 78,79,80,81,82,83 : m<0-5>5 + + 3, 2,11,10,19,18 : w<0-5>0 + 27, 1,26, 0, 9, 8 : w<0-5>1 + 16,17,24,25,64,65 : w<0-5>2 + 66,67,68,69,70,71 : w<0-5>3 + 72,73,74,75,76,77 : w<0-5>4 + 78,79,80,81,82,83 : w<0-5>5 + + 3, 2,11,10,19,18 : s<0-5>0 + 27, 1,26, 0, 9, 8 : 
s<0-5>1 + 16,17,24,25,64,65 : s<0-5>2 + 66,67,68,69,70,71 : s<0-5>3 + 72,73,74,75,76,77 : s<0-5>4 + 78,79,80,81,82,83 : s<0-5>5 + + 85,84,86,87 : t<0-3>0 + 85,87,84,86 : t<0-3>1 + 85,84,87,86 : t<0-3>2 + 85,84,87,86 : t<0-3>3 + 85,84,87,86 : t<0-3>4 + 85,84,87,86 : t<0-3>5 + 85,84,87,86 : r0<0-3> + 85,84,87,86 : r1<0-3> + 85,87,86,84 : r2<0-3> + 84,85,86,87 : r3<0-3> + 85,84,87,86 : r4<0-3> + 84,85,87,86 : r5<0-3> + + + +--:-:-:-:0 MOV C, param_C; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:-:-:1 MOV swapBuf, 4x<32*36*2*2>; +01:-:-:-:0 ISETP.GE.AND P0, PT, tid, 128, PT; +--:-:-:-:1 STS.128 [addr_zero], RZ; +--:-:-:Y:c LOP.AND partialC, C, 1; +--:-:-:-:0 IADD C, C, partialC; +--:-:-:-:5 @P0 BRA.U COMPUTE_SETUP; + +############################################################## +LOAD_SETUP: + +--:-:1:-:1 S2R idx_YXk, SR_CTAID.X; +--:-:2:-:1 S2R idx_K, SR_CTAID.Y; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +] + +--:-:-:-:1 ISETP.EQ.AND P0, PT, tid, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, tid, 64, PT; + +// idx_Y2 = idx_YXk / blk_Xk +--:-:-:-:1 MOV magic_Xk, param_magic_Xk; +--:-:-:-:1 IADD negXk, RZ, -param_Xk; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_Xk, 1, PT; +01:-:-:-:1 @P3 XMAD div1, idx_YXk, magic_Xk, RZ; +--:-:-:-:1 @P3 XMAD div2, idx_YXk, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD div3, idx_YXk.H1, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk, div1; +--:-:-:-:1 @P3 IADD3.RS idx_Y2, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 idx_Y2, idx_Y2, param_shift_Xk; +--:-:-:-:1 @!P3 SHR.U32 idx_Y2, idx_YXk, param_shift_Xk; + +// idx_Xk = idx_YXk % blk_Xk +--:-:-:-:1 XMAD.LO2 idx_Xk, negXk, idx_Y2, idx_YXk; + +// idx_X2 = idx_Xk / blk_k +// idx_k = idx_Xk % blk_k +--:-:-:-:1 XMAD idx_X2, idx_Xk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_X2, idx_X2, param_shift_k; +--:-:-:-:1 XMAD idx_k, idx_X2, param_k, RZ; +--:-:-:-:1 IADD idx_k, -idx_k, idx_Xk; + +// idx_K = idx_K * blk_k + 
idx_k +02:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +//--:-:-:-:1 MOV idx_X, idx_X2; +//--:-:-:-:1 MOV idx_Y, idx_Y2; + +// gx = x2 +// gy = y2 * 2 +--:-:-:-:1 MOV idx_X, idx_X2; +--:-:-:-:1 SHL idx_Y, idx_Y2, 1; + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// if y2 != Y2: +// gy += (gx&1) ^ ((gx&2)>>1) +// gx /= 2 +--:-:-:-:1 ISETP.NE.AND P4, PT, idx_Y2, param_Y2, PT; +--:-:-:-:1 @P4 LOP.AND x1, idx_X, 1; +--:-:-:-:1 @P4 BFE.U32 x2, idx_X, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P4 LOP.XOR x1, x1, x2; +--:-:-:-:1 @P4 IADD idx_Y, idx_Y, x1; +--:-:-:-:1 @P4 SHR.U32 idx_X, idx_X, 1; + +// Scan backwards on odd rows +// if y2 & 1: +// gx = gridX - gx - 1 +--:-:-:-:1 LOP.AND.NZ P5, RZ, idx_Y2, 1; +--:-:-:-:1 @P5 IADD idx_X, -idx_X, param_GX; +--:-:-:-:1 @P5 IADD idx_X, idx_X, -1; + +--:-:-:-:1 @P0 STS [addr_idx_Y], idx_Y; +--:-:-:-:1 @P0 STS [addr_idx_X], idx_X; +--:-:-:-:1 @P0 STS [addr_idx_K], idx_K; + +// x = gx << shiftX +// y = gy << shiftY +--:-:-:-:1 SHL gx, idx_X, param_shiftX; +--:-:-:-:1 SHL gy, idx_Y, param_shiftY; + +// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp +--:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 super_y, tid, param_superY; +--:-:-:-:1 ISCADD gx, super_x, gx, 1; +--:-:-:-:1 ISCADD gy, super_y, gy, 1; + +--:-:-:-:1 LOP.AND tid32_2, tid, -32; +--:-:-:-:1 SHR.U32 tid32_2, tid32_2, 2; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid32_2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD3 readFs, readFs, tid1, tid32_2; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*36*2>, 4; + +// c = (tid & 32) >> 5 +--:-:-:-:1 BFE.U32 c, tid, 0x105; // 1 bits at 
position 5 + +// P6 = c == partialC == 1 +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, partialC, P6; + +--:-:-:-:1 LOP.AND tid31, tid, 31; + + +04:-:-:-:5 @P1 BRA.U FILTER_SETUP; + +############################################################## +IMAGE_SETUP: + + + +// writeS = c*32*36 + tid31 +--:-:-:-:1 XMAD writeS, c, 1152, tid31; +--:-:-:-:1 SHL writeS, writeS, 2; + +--:-:-:-:1 STS [writeS + 4x<32*0>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*1>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*2>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*3>], RZ; + +// n = idx_N< +--:-:-:-:1 @!P0 MOV i00, RZ; +--:-:2:-:1 @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>]; +--:-:-:-:1 @!P2 MOV i10, RZ; +--:-:3:-:1 @P2 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>]; +--:-:-:-:1 @!P1 MOV i01, RZ; +--:-:4:-:1 @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>]; +--:-:-:-:1 @!P3 MOV i11, RZ; +--:6:5:-:1 @P3 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>]; + + + +--:-:-:-:5 BAR.SYNC 0; + +20:-:-:-:0 IADD track0.CC, track0, -partialC; + +--:-:-:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:1:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; + +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:2 IADD swapBuf, RZ, -swapBuf; +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:5 BRA.U IMAGE_LOOP; + +############################################################## +FILTER_SETUP: + + + +// writeS = (c*32*36 + (tid & 31)*4 + 32*36*2)*4 +--:-:-:-:1 ISCADD writeS, tid31, 4x<32*36*2>, 4; +--:-:-:-:1 XMAD writeS, c, 4x<32*36>, writeS; + +--:-:-:-:1 STS.128 [writeS], RZ; + +// offset = c*32*36 + tid31*4 +--:-:-:-:1 SHL tid31, tid31, 2; +--:-:-:-:1 XMAD offset, c, 1x<32*36>, tid31; + +// (kBlks,C,6,6,32) +// offset += (idx_K*C*32*36) * itemsize; +--:-:-:-:1 XMAD.LO2C offset, idx_K, param_C_1152, offset; +--:-:-:-:1 LEA track0.CC, 
offset, param_F[0], [+ dshift() +]; +--:-:-:-:1 LEA.HI.X track1, offset, param_F[1], RZ, [+ dshift() +]; + +--:-:-:-:1 XMAD partialC, partialC, 1x<32*36 * $dsize>, RZ; + +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F0, [track + 4x<0*32 * $dsize>]; +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F1, [track + 4x<1*32 * $dsize>]; +--:-:2:-:1 @!P6 LDG.E.[+ vsize() +] F2, [track + 4x<2*32 * $dsize>]; + +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F0, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F1, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F2, [addr_zero]; + +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F3, [track + 4x<3*32 * $dsize>]; +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F4, [track + 4x<4*32 * $dsize>]; +--:-:3:-:1 @!P6 LDG.E.[+ vsize() +] F5, [track + 4x<5*32 * $dsize>]; + +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F3, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F4, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F5, [addr_zero]; + +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F6, [track + 4x<6*32 * $dsize>]; +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F7, [track + 4x<7*32 * $dsize>]; +--:6:4:-:1 @!P6 LDG.E.[+ vsize() +] F8, [track + 4x<8*32 * $dsize>]; + +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F6, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F7, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F8, [addr_zero]; + + +--:-:-:-:5 BAR.SYNC 0; + +20:-:-:-:0 IADD track0.CC, track0, -partialC; + +--:-:-:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:1:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; + +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:2 IADD swapBuf, RZ, -swapBuf; +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:5 BRA.U FILTER_LOOP; + +############################################################## + +COMPUTE_SETUP: + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +--:-:-:-:1 IADD tid128, tid, -128; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readFs = ((tid & 
-16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid128, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readIs, tid128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid16; +--:-:-:-:1 ISCADD readIs, readIs, 4x<32*4 + 32*36*2*2>, 4; + +--:-:-:-:1 LOP.AND tid_1, tid128, 1; +--:-:-:-:1 LOP.AND readFs, tid128, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 IADD3 readFs, readFs, tid16, tid_1; +--:-:-:-:0 ISCADD readFs, readFs, 4x<32*4 + 32*36*2*3>, 4; + + +--:-:-:-:5 BAR.SYNC 0; + +// Let Load loop run once to transform initial load and store to shared. +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>]; +--:-:1:-:2 LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>]; + +COMPUTE_LOOP: +[+ + my %insert = ( + + j0c33 => "--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT;\n" . + "--:-:-:-:1 IADD C, C, -2;\n", + + j0c62 => "02:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j1c63 => "--:-:-:Y:5 \@P0 BRA.U COMPUTE_LOOP;\n" . + "--:-:-:Y:5 BRA.U COMPUTE_FINISH;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 1) + { + my $odd = $j; + my $nOdd = 1 - $j; + my $rsPred = $j == 1 ? '@P0' : ' '; + my $bar = $j == 0 ? 
'2' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd; + + $insert{"j${j}c31"} = sprintf "--:%s:1:-:1 %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd; + + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + #$stall = '4' if $stall && $c % 2 == 0 && $j == 0 && $c > 16; + + my $yield = ($c % 5 == 0) && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +IMAGE_LOOP: +--:-:-:-:1 ISETP.GT.AND P6, PT, C, 2, PT; +[+ + our ($dtype, $dsize, $convert_in, $W, $N); + my %insert = ( + + j0c0 => "--:-:-:-:1 ISETP.GT.AND P5, PT, C, RZ, PT;\n" . + "--:-:-:-:1 IADD C, C, -2;\n", + + $convert_in ? ( + j0c1 => "02:-:2:-:1 F2F.F32.F16 i00, i00;\n", + j0c2 => "04:-:3:-:1 F2F.F32.F16 i10, i10;\n", + j0c3 => "08:-:4:-:1 F2F.F32.F16 i01, i01;\n", + j0c4 => "10:-:5:-:1 F2F.F32.F16 i11, i11;\n", + ) : (), + + j0c5 => "02:-:-:-:1 STS [writeS + 4x<32*(0*6 + 0)>], I00;\n", + j0c6 => "04:-:-:-:1 STS [writeS + 4x<32*(5*6 + 0)>], I50;\n", + + j0c7 => "--:-:-:-:1 FFMA TI10, i10, 0.75, i00;\n" . + "--:-:-:-:1 FFMA TI20, i10, -0.75, i00;\n" . + "--:-:-:-:1 FFMA TI30, i10, 1.50, i00;\n" . + "--:-:-:-:1 FFMA TI40, i10, -1.50, i00;\n" . + "--:-:-:-:1 IADD track0.CC, track0, param_HWN2p;\n" . 
+ "--:-:-:-:1 @!P6 MOV preds, RZ;\n", + + j0c8 => "08:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 5)>], I05;\n", + j0c9 => "10:6:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 5)>], I55;\n", + + j0c10 => "--:-:-:-:0 FFMA TI11, i11, 0.75, i01;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 0)>], I10;\n" . + "--:-:-:-:0 FFMA TI21, i11, -0.75, i01;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 0)>], I20;\n" . + "--:-:-:-:0 FFMA TI31, i11, 1.50, i01;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 0)>], I30;\n" . + "--:-:-:-:0 FFMA TI41, i11, -1.50, i01;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 0)>], I40;\n" . + "--:-:-:-:1 R2P PR, preds, 0xf;\n" . + "--:-:-:-:1 IADD.X track1, track1, RZ;\n", + + j0c11 => "--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n", + j0c13 => "--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n", + j0c19 => "--:-:1:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + + j0c14 => "--:-:-:-:0 FFMA I01, TI01, 0.75, TI00;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 5)>], I15;\n" . + "--:-:-:-:0 FFMA I02, TI01, -0.75, TI00;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 5)>], I25;\n" . + "--:-:-:-:0 FFMA I03, TI01, 1.50, TI00;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 5)>], I35;\n" . + "--:-:-:-:0 FFMA I04, TI01, -1.50, TI00;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 5)>], I45;\n", + + j0c15 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 1)>], I01;\n", + j0c16 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 2)>], I02;\n", + j0c17 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 3)>], I03;\n", + j0c18 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 4)>], I04;\n", + + j0c20 => "--:-:-:-:1 FFMA I51, TI51, 0.75, TI50;\n" . + "--:-:-:-:1 FFMA I52, TI51, -0.75, TI50;\n" . + "--:-:-:-:1 FFMA I53, TI51, 1.50, TI50;\n" . 
+ "--:-:-:-:1 FFMA I54, TI51, -1.50, TI50;\n", + + j0c21 => "20:-:2:-:1 \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n", + j0c22 => "--:-:3:-:1 \@P2 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n", + j0c23 => "--:-:4:-:1 \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n", + j0c24 => "--:-:5:-:1 \@P3 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n", + + j0c25 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 1)>], I51;\n", + j0c26 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 2)>], I52;\n", + j0c27 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 3)>], I53;\n", + j0c28 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 4)>], I54;\n", + + j0c29 => "--:-:-:-:1 FFMA I11, TI11, 0.75, TI10;\n" . + "--:-:-:-:1 FFMA I12, TI11, -0.75, TI10;\n" . + "--:-:-:-:1 FFMA I13, TI11, 1.50, TI10;\n" . + "--:-:-:-:1 FFMA I14, TI11, -1.50, TI10;\n", + + j0c30 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 1)>], I11;\n", + j0c31 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 2)>], I12;\n", + j1c0 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 3)>], I13;\n", + j1c1 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 4)>], I14;\n", + + j1c2 => "--:-:-:-:1 FFMA I21, TI21, 0.75, TI20;\n" . + "--:-:-:-:1 FFMA I22, TI21, -0.75, TI20;\n" . + "--:-:-:-:1 FFMA I23, TI21, 1.50, TI20;\n" . + "--:-:-:-:1 FFMA I24, TI21, -1.50, TI20;\n", + + j1c3 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 1)>], I21;\n", + j1c4 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 2)>], I22;\n", + j1c5 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 3)>], I23;\n", + j1c6 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 4)>], I24;\n", + + j1c7 => "--:-:-:-:1 FFMA I31, TI31, 0.75, TI30;\n" . + "--:-:-:-:1 FFMA I32, TI31, -0.75, TI30;\n" . + "--:-:-:-:1 FFMA I33, TI31, 1.50, TI30;\n" . 
+ "--:-:-:-:1 FFMA I34, TI31, -1.50, TI30;\n", + + j1c8 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 1)>], I31;\n", + j1c9 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 2)>], I32;\n", + j1c10 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 3)>], I33;\n", + j1c11 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 4)>], I34;\n", + + j1c12 => "--:-:-:-:1 FFMA I41, TI41, 0.75, TI40;\n" . + "--:-:-:-:1 FFMA I42, TI41, -0.75, TI40;\n" . + "--:-:-:-:1 FFMA I43, TI41, 1.50, TI40;\n" . + "--:-:-:-:1 FFMA I44, TI41, -1.50, TI40;\n", + + j1c13 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 1)>], I41;\n", + j1c14 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 2)>], I42;\n", + j1c15 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 3)>], I43;\n", + j1c16 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 4)>], I44;\n", + + j1c17 => "--:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P5 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P5 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P5 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P5 IADD swapBuf, RZ, -swapBuf;\n", + + j1c18 => "--:-:-:-:1 \@P5 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n", + j1c20 => "--:-:-:-:1 \@P5 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n", + j1c22 => "--:-:1:-:1 \@P5 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n", + + j1c31 => "--:-:-:Y:5 \@P5 BRA.U IMAGE_LOOP;\n" . + "--:-:-:Y:5 BRA.U LOAD_FINISH;", + + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "01" : '--'; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 
0 : 1; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + + +FILTER_LOOP: +--:-:-:-:1 ISETP.GT.AND P0, PT, C, RZ, PT; +20:-:-:-:1 IADD track0.CC, track0, 1x<32*36*2 * $dsize>; +--:-:-:-:1 ISETP.GT.AND P1, PT, C, 2, PT; +--:-:-:-:1 IADD C, C, -2; +[+ + our ($vsize, $dsize, $convert_in); + my %insert = ( + + j0c3 => "--:-:-:-:1 IADD.X track1, track1, RZ;\n", + + j0c0 => "--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n", + j0c2 => "--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n", + j0c18 => "--:-:1:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + + j1c12 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n", + j1c14 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n", + j1c16 => "--:-:1:-:1 \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n", + + $convert_in ? ( + + j0c1 => "02:-:-:-:1 F2F.F32.F16 F03, F01.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F02, F01.H0;\n", + j0c4 => "--:-:-:-:1 F2F.F32.F16 F01, F00.H1;\n" . + "--:-:2:-:1 F2F.F32.F16 F00, F00.H0;\n", + + j0c5 => "--:-:-:-:1 F2F.F32.F16 F13, F11.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F12, F11.H0;\n", + j0c6 => "--:-:-:-:1 F2F.F32.F16 F11, F10.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 F10, F10.H0;\n", + + j0c7 => "--:-:-:-:1 F2F.F32.F16 F23, F21.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F22, F21.H0;\n", + j0c8 => "--:-:-:-:1 F2F.F32.F16 F21, F20.H1;\n" . 
+ "--:-:6:-:1 F2F.F32.F16 F20, F20.H0;\n", + + j0c9 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*32*4>], F0;\n", + j0c10 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], F1;\n", + j0c11 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], F2;\n", + + j0c13 => "02:-:-:-:1 \@P1 LDG.E.$vsize F0, [track + 4x<0*32 * $dsize>];\n", + j0c14 => "10:-:-:-:1 \@P1 LDG.E.$vsize F1, [track + 4x<1*32 * $dsize>];\n", + j0c15 => "20:-:2:-:1 \@P1 LDG.E.$vsize F2, [track + 4x<2*32 * $dsize>];\n", + + j0c16 => "04:-:-:-:1 F2F.F32.F16 F33, F31.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F32, F31.H0;\n", + j0c17 => "--:-:-:-:1 F2F.F32.F16 F31, F30.H1;\n" . + "--:-:3:-:1 F2F.F32.F16 F30, F30.H0;\n", + + j0c19 => "--:-:-:-:1 F2F.F32.F16 F43, F41.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F42, F41.H0;\n", + j0c20 => "--:-:-:-:1 F2F.F32.F16 F41, F40.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 F40, F40.H0;\n", + + j0c21 => "--:-:-:-:1 F2F.F32.F16 F53, F51.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F52, F51.H0;\n", + j0c22 => "--:-:-:-:1 F2F.F32.F16 F51, F50.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 F50, F50.H0;\n", + + j0c23 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], F3;\n", + j0c24 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], F4;\n", + j0c25 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], F5;\n", + + j0c27 => "04:-:-:-:1 \@P1 LDG.E.$vsize F3, [track + 4x<3*32 * $dsize>];\n", + j0c28 => "10:-:-:-:1 \@P1 LDG.E.$vsize F4, [track + 4x<4*32 * $dsize>];\n", + j0c29 => "20:-:3:-:1 \@P1 LDG.E.$vsize F5, [track + 4x<5*32 * $dsize>];\n", + + j0c30 => "08:-:-:-:1 F2F.F32.F16 F63, F61.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F62, F61.H0;\n", + j0c31 => "--:-:-:-:1 F2F.F32.F16 F61, F60.H1;\n" . + "--:-:4:-:1 F2F.F32.F16 F60, F60.H0;\n", + + j1c0 => "--:-:-:-:1 F2F.F32.F16 F73, F71.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F72, F71.H0;\n", + j1c1 => "--:-:-:-:1 F2F.F32.F16 F71, F70.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 F70, F70.H0;\n", + + j1c2 => "--:-:-:-:1 F2F.F32.F16 F83, F81.H1;\n" . 
+ "--:-:-:-:1 F2F.F32.F16 F82, F81.H0;\n", + j1c3 => "--:-:-:-:1 F2F.F32.F16 F81, F80.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 F80, F80.H0;\n", + + j1c4 => "08:4:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], F6;\n", + j1c5 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], F7;\n", + j1c6 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], F8;\n", + + j1c8 => "08:-:-:-:1 \@P1 LDG.E.$vsize F6, [track + 4x<6*32 * $dsize>];\n", + j1c9 => "10:-:-:-:1 \@P1 LDG.E.$vsize F7, [track + 4x<7*32 * $dsize>];\n", + j1c10 => "20:6:4:-:1 \@P1 LDG.E.$vsize F8, [track + 4x<8*32 * $dsize>];\n", + + ) : ( + + j0c6 => "02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], F0;\n", + j0c8 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], F1;\n", + j0c10 => "--:2:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], F2;\n", + + j0c12 => "02:-:-:-:1 \@P1 LDG.E.$vsize F0, [track + 4x<0*32 * $dsize>];\n", + j0c14 => "--:-:-:-:1 \@P1 LDG.E.$vsize F1, [track + 4x<1*32 * $dsize>];\n", + j0c16 => "--:-:2:-:1 \@P1 LDG.E.$vsize F2, [track + 4x<2*32 * $dsize>];\n", + + j0c20 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], F3;\n", + j0c22 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], F4;\n", + j0c24 => "--:3:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], F5;\n", + + j0c26 => "04:-:-:-:1 \@P1 LDG.E.$vsize F3, [track + 4x<3*32 * $dsize>];\n", + j0c28 => "--:-:-:-:1 \@P1 LDG.E.$vsize F4, [track + 4x<4*32 * $dsize>];\n", + j0c30 => "--:-:3:-:1 \@P1 LDG.E.$vsize F5, [track + 4x<5*32 * $dsize>];\n", + + j1c0 => "08:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], F6;\n", + j1c2 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], F7;\n", + j1c4 => "--:4:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], F8;\n", + + j1c6 => "08:-:-:-:1 \@P1 LDG.E.$vsize F6, [track + 4x<6*32 * $dsize>];\n", + j1c8 => "--:-:-:-:1 \@P1 LDG.E.$vsize F7, [track + 4x<7*32 * $dsize>];\n", + j1c10 => "--:6:4:-:1 \@P1 LDG.E.$vsize F8, [track + 4x<8*32 * $dsize>];\n", + ), + + j1c11 => "--:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j1c31 => "--:-:-:Y:5 \@P0 BRA.U FILTER_LOOP;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "01" : '--'; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + +LOAD_FINISH: + +//--:-:-:-:5 EXIT; + + +--:-:1:-:2 S2R Tid, SR_TID.X; + +--:-:-:-:1 MOV alpha16, param_alpha; + +01:-:-:-:1 LOP.AND Tid32_2, Tid, -32; +--:-:-:-:1 SHR.U32 Tid32_2, Tid32_2, 2; + +// readFs = ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND Tid1, Tid, 1; +01:-:-:-:1 LOP.AND readFs, Tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD readFs, readFs, Tid1; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2) +--:-:-:-:1 BFE.U32 readIs, Tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, Tid32_2; +--:-:-:-:1 ISCADD readIs, readFs, readIs, 2; + +--:-:-:-:1 SHL readIs, readIs, 4; +--:-:-:-:1 SHL readFs, readFs, 3; + +// writeCs = readFs * 32*36 + readIs; +--:-:-:-:1 XMAD write16Cs, readFs, 1x<32*36>, readIs; + + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y2, 
alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y2, alpha16; +--:-:-:-:4 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y3, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y3, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y6, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y6, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y7, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y7, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 EXIT; + + 
+COMPUTE_FINISH: + +//--:-:-:-:5 EXIT; + + +--:-:1:-:2 S2R tid_128, SR_TID.X; + + +01:-:-:-:1 IADD tid_128, tid_128, -128; // tid_128 = tid - 128 + +--:-:-:-:1 ISETP.GE.AND P6, PT, tid_128, 256, PT; + +// readFs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND Tid_1, tid_128, 1; +--:-:-:-:1 LOP.AND readFs2, tid_128, 8; +--:-:-:-:1 SHR.U32 readFs2, readFs2, 2; +--:-:-:-:1 IADD readFs2, readFs2, Tid_1; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2) +--:-:-:-:1 LOP.AND tid_16, tid_128, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; +--:-:-:-:1 BFE.U32 readIs2, tid_128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs2, readIs2, tid_16; +--:-:-:-:1 ISCADD readIs2, readFs2, readIs2, 2; + +--:-:-:-:1 ISCADD readIs2, readIs2, 4x<32*4>, 4; +--:-:-:-:1 SHL readFs2, readFs2, 3; + +// writeCs = readFs * 32*36 + readIs; +--:-:-:-:0 XMAD writeCs, readFs2, 1x<32*36>, readIs2; + + +--:-:-:-:5 @P6 BRA.U SKIP0; + +--:-:2:-:1 LDS idxX, [addr_idx_X]; +--:-:3:-:1 LDS idxY, [addr_idx_Y]; +--:-:1:-:1 S2R idxN, SR_CTAID.Z; +--:-:4:-:1 LDS idxK, [addr_idx_K]; + + +--:-:-:-:1 LOP.AND tid_31, tid_128, 31; +--:-:-:-:1 SHR.U32 tid_32, tid_128, 5; +--:-:-:-:1 SHR.U32 tid_64, tid_128, 6; + + +// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16 +--:-:-:-:1 XMAD readCs, tid_32, 1x<32*36>, tid_31; +--:-:-:-:1 ISCADD readCs, tid_64, readCs, 4; +--:-:-:-:1 SHL readCs, readCs, 2; // readCs *= 4 + +// Superblock offset +// idxX <<= shiftX +// idxY <<= shiftY +04:-:-:-:1 SHL idxY, idxY, param_shiftY; +02:-:-:-:1 SHL idxX, idxX, param_shiftX; +01:-:-:-:1 SHL idxN, idxN, param_shiftN; + +// Get this thread's offset within the superblock +--:-:-:-:1 BFE.U32 p, tid_31, param_SuperY; +--:-:-:-:1 BFE.U32 q, tid_31, param_SuperX; +--:-:-:-:1 LOP.AND n, tid_31, param_SuperN; + +--:-:-:-:1 ISCADD q, q, idxX, 1; +--:-:-:-:1 ISCADD p, p, idxY, 1; + +--:-:-:-:1 MOV four, -4; +--:-:-:-:1 IADD3 q, q, param_pad_x, four; +--:-:-:-:1 IADD3 p, p, param_pad_y, four; + +[+ + our ($type, $N); + if ($type eq 'h') + { + return q{
+--:-:-:-:1 SHL tid31_4, tid_31, 2; + +--:-:-:-:1 ISCADD n, n, idxN, 1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tid_31, 16, PT; + } + } + else { + return q{ +--:-:-:-:1 IADD n, n, idxN; +--:-:-:-:1 ISETP.LT.AND P6, PT, n, param_N, PT; + }; + } ++] + +// k = idxK*32 + tid_32<<1 +--:-:-:-:1 SHL tid_32, tid_32, 1; +08:-:-:-:1 ISCADD k, idxK, tid_32, 5; + +// Out = k*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD.S16.U16 offsetO, q, param_N, n; +--:-:-:-:1 XMAD.S16.U16.LO2C offsetO, p, param_QN, offsetO; +--:-:-:-:1 XMAD.U16.U16.LO2C offsetO, k, param_PQN, offsetO; +--:-:-:-:1 ISET.LT.AND sign, offsetO, RZ, PT; + +--:-:-:-:1 LEA Out0.CC, offsetO, param_O[0], [+ dshift() +]; +--:-:-:-:1 IADD.X Out1, sign, param_O[1]; + +--:-:-:-:1 ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! no-op + +--:-:-:-:1 IADD z1, q, 1; +--:-:-:-:1 IADD z2, q, 2; +--:-:-:-:1 IADD z3, q, 3; +--:-:-:-:1 IADD z4, q, 4; +--:-:-:-:1 IADD z5, q, 5; +--:-:-:-:1 ISETP.LT.AND P0, PT, q, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, z4, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P5, PT, z5, param_Q, P5; +--:-:-:-:1 ISETP.GE.AND P0, PT, q, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, z4, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, z5, RZ, P5; +--:-:-:-:1 P2R mask_q, PR, RZ, 0x3f; + +--:-:-:-:1 IADD z1, p, 1; +--:-:-:-:1 IADD z2, p, 2; +--:-:-:-:1 IADD z3, p, 3; +--:-:-:-:1 IADD z4, p, 4; +--:-:-:-:1 IADD z5, p, 5; +--:-:-:-:1 ISETP.LT.AND P0, PT, p, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P4, PT, z4, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, z5, param_P, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, p, RZ, P0; 
+--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, z4, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, z5, RZ, P5; + +--:-:-:-:1 SEL pred30, mask_q, RZ, P0; +--:-:-:-:1 @P1 BFI pred30, mask_q, 0x606, pred30; +--:-:-:-:1 @P2 BFI pred30, mask_q, 0x60c, pred30; +--:-:-:-:1 @P3 BFI pred30, mask_q, 0x612, pred30; +--:-:-:-:1 @P4 BFI pred30, mask_q, 0x618, pred30; +--:-:-:-:1 SEL pred36, mask_q, RZ, P5; + +--:-:-:-:1 ISETP.GE.AND P6, PT, tid_128, 256, PT; + + + +SKIP0: + + +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x7y0, ccx7y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y1, ccx3y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x7y1, ccx7y2, param_alpha; + +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P6 BRA.U SKIP1; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP1: + +--:-:-:-:0 IADD k, k, 1; +--:-:-:-:5 BAR.SYNC 0; +01:-:-:-:1 IADD Out0.CC, Out0, param_PQNp; +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y0, 
ccx2y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y1, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y1, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y3, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y3, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y3, param_alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y3, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y3, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y3, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y3, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y3, param_alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:0 IADD.X Out1, Out1, RZ; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P6 BRA.U SKIP2; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP2: + +--:-:-:-:0 IADD k, k, 15; +--:-:-:-:5 BAR.SYNC 0; +01:-:-:-:1 IADD Out0.CC, Out0, param_PQN15p; +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y4, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y4, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y6, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y6, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y6, param_alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y6, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y6, param_alpha; +--:-:-:-:1 FMUL 
shuffle_x5y1, ccx5y6, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y6, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y6, param_alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:0 IADD.X Out1, Out1, RZ; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P6 BRA.U SKIP3; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP3: + +--:-:-:-:0 IADD k, k, 1; +--:-:-:-:5 BAR.SYNC 0; +01:-:-:-:1 IADD Out0.CC, Out0, param_PQNp; +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y5, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y5, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y7, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y7, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y7, param_alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y7, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y7, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y7, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y7, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y7, param_alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:0 IADD.X Out1, Out1, RZ; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P6 BRA.U SKIP4; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP4: + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + + +01:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, PT; +--:-:-:-:1 @!P0 MOV pred30, RZ; +--:-:-:-:1 @!P0 MOV pred36, RZ; +[+ + my $out; + foreach my $i (0 .. 2) + { + foreach my $j (0 ..
5) + { + my $b = $i + 1; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + + + +// t0 = I[1,:] + I[2,:] +// t1 = I[1,:] - I[2,:] +// t2 = I[3,:] + I[4,:] +// t3 = I[3,:] - I[4,:] +// O[2,:] = t0 * -2.25 + t2 * -0.5625 + I[0,:] * -2.8125 +// O[1,:] = t1 * -1.6875 + t3 * -0.84375 + I[5,:] * 1.265625 +// O[3,:] = t1 * 0.75 + t3 * 1.5 + I[5,:] * -2.8125 +// O[4,:] = I[0,:] + t0 + t2 +// O[0,:] = I[0,:] * 1.265625 +// O[5,:] = I[5,:] +[+ + my $out; + foreach my $i (0 .. 2) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ +$w:-:-:-:1 FADD t0$i, m1$i, m2$i; +--:-:-:-:1 FADD t1$i, m1$i, -m2$i; +--:-:-:-:1 FADD t2$i, m3$i, m4$i; +--:-:-:-:1 FADD t3$i, m3$i, -m4$i; +--:-:-:-:1 FMUL w2$i, m0$i, -2.8125; +--:-:-:-:1 FFMA w2$i, t0$i, -2.25, w2$i; +--:-:-:-:1 FFMA w2$i, t2$i, -0.5625, w2$i; +--:-:-:-:1 FMUL w1$i, m5$i, 1.265625; +--:-:-:-:1 FFMA w1$i, t1$i, -1.6875, w1$i; +--:-:-:-:1 FFMA w1$i, t3$i, -0.84375, w1$i; +--:-:-:-:1 FMUL w3$i, m5$i, -2.8125; +--:-:-:-:1 FFMA w3$i, t1$i, 0.75, w3$i; +--:-:-:-:1 FFMA w3$i, t3$i, 1.5, w3$i; +--:-:-:-:1 FADD w4$i, m0$i, t0$i; +--:-:-:-:1 FADD w4$i, w4$i, t2$i; +--:-:-:-:1 FMUL w0$i, m0$i, 1.265625; + }; + } + foreach my $i (3 .. 5) + { + foreach my $j (0 .. 5) + { + my $b = $i + 1; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + + + +[+ + my $out; + foreach my $i (3 .. 
5) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ +$w:-:-:-:1 FADD t0$i, m1$i, m2$i; +--:-:-:-:1 FADD t1$i, m1$i, -m2$i; +--:-:-:-:1 FADD t2$i, m3$i, m4$i; +--:-:-:-:1 FADD t3$i, m3$i, -m4$i; +--:-:-:-:1 FMUL w2$i, m0$i, -2.8125; +--:-:-:-:1 FFMA w2$i, t0$i, -2.25, w2$i; +--:-:-:-:1 FFMA w2$i, t2$i, -0.5625, w2$i; +--:-:-:-:1 FMUL w1$i, m5$i, 1.265625; +--:-:-:-:1 FFMA w1$i, t1$i, -1.6875, w1$i; +--:-:-:-:1 FFMA w1$i, t3$i, -0.84375, w1$i; +--:-:-:-:1 FMUL w3$i, m5$i, -2.8125; +--:-:-:-:1 FFMA w3$i, t1$i, 0.75, w3$i; +--:-:-:-:1 FFMA w3$i, t3$i, 1.5, w3$i; +--:-:-:-:1 FADD w4$i, m0$i, t0$i; +--:-:-:-:1 FADD w4$i, w4$i, t2$i; +--:-:-:-:1 FMUL w0$i, m0$i, 1.265625; + }; + } + return $out; ++] +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; + + + +[+ + my $out; + foreach my $i (0 .. 5) + { + $out .= qq{ +--:-:-:-:1 FADD r${i}0, w${i}1, w${i}2; +--:-:-:-:1 FADD r${i}1, w${i}1, -w${i}2; +--:-:-:-:1 FADD r${i}2, w${i}3, w${i}4; +--:-:-:-:1 FADD r${i}3, w${i}3, -w${i}4; +--:-:-:-:1 FMUL s${i}2, w${i}0, -2.8125; +--:-:-:-:1 FFMA s${i}2, r${i}0, -2.25, s${i}2; +--:-:-:-:1 FFMA s${i}2, r${i}2, -0.5625, s${i}2; +--:-:-:-:1 FMUL s${i}1, w${i}5, 1.265625; +--:-:-:-:1 FFMA s${i}1, r${i}1, -1.6875, s${i}1; +--:-:-:-:1 FFMA s${i}1, r${i}3, -0.84375, s${i}1; +--:-:-:-:1 FMUL s${i}3, w${i}5, -2.8125; +--:-:-:-:1 FFMA s${i}3, r${i}1, 0.75, s${i}3; +--:-:-:-:1 FFMA s${i}3, r${i}3, 1.5, s${i}3; +--:-:-:-:1 FADD s${i}4, w${i}0, r${i}0; +--:-:-:-:1 FADD s${i}4, s${i}4, r${i}2; +--:-:-:-:1 FMUL s${i}0, w${i}0, 1.265625; + }; + } + return $out; ++] +[+ + our $type; + return $type eq 'h' ? 
q{ + +--:-:-:-:1 IADD readCs, readCs, -tid31_4; +--:-:-:-:1 SHR.U32 tid31_4, tid31_4, 1; +--:-:-:-:1 IADD readCs, readCs, tid31_4; + + +--:-:-:-:1 F2F.F16.F32 s05, s05; +--:-:-:-:1 F2F.F16.F32 s00, s00; +--:-:-:-:1 F2F.F16.F32 s02, s02; +--:-:-:-:1 F2F.F16.F32 s01, s01; +--:-:-:-:1 F2F.F16.F32 s03, s03; +--:-:1:-:1 F2F.F16.F32 s04, s04; + +--:-:-:-:1 F2F.F16.F32 s15, s15; +--:-:-:-:1 F2F.F16.F32 s10, s10; +--:-:-:-:1 F2F.F16.F32 s12, s12; +--:-:-:-:1 F2F.F16.F32 s11, s11; +--:-:-:-:1 F2F.F16.F32 s13, s13; +--:-:2:-:1 F2F.F16.F32 s14, s14; + +01:-:-:-:1 STS.U16 [readCs + 4x<(0*6+0)*32>], s00; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+1)*32>], s01; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+2)*32>], s02; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+3)*32>], s03; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+4)*32>], s04; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+5)*32>], s05; + +--:-:-:-:1 F2F.F16.F32 s25, s25; +--:-:-:-:1 F2F.F16.F32 s20, s20; +--:-:-:-:1 F2F.F16.F32 s22, s22; +--:-:-:-:1 F2F.F16.F32 s21, s21; +--:-:-:-:1 F2F.F16.F32 s23, s23; +--:-:3:-:1 F2F.F16.F32 s24, s24; + +02:-:-:-:1 STS.U16 [readCs + 4x<(1*6+0)*32>], s10; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+1)*32>], s11; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+2)*32>], s12; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+3)*32>], s13; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+4)*32>], s14; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+5)*32>], s15; + +--:-:-:-:1 F2F.F16.F32 s35, s35; +--:-:-:-:1 F2F.F16.F32 s30, s30; +--:-:-:-:1 F2F.F16.F32 s32, s32; +--:-:-:-:1 F2F.F16.F32 s31, s31; +--:-:-:-:1 F2F.F16.F32 s33, s33; +--:-:4:-:1 F2F.F16.F32 s34, s34; + +04:-:-:-:1 STS.U16 [readCs + 4x<(2*6+0)*32>], s20; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+1)*32>], s21; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+2)*32>], s22; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+3)*32>], s23; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+4)*32>], s24; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+5)*32>], s25; + +--:-:-:-:1 F2F.F16.F32 s45, s45; +--:-:-:-:1 F2F.F16.F32 s40, s40; +--:-:-:-:1 F2F.F16.F32 s42, 
s42; +--:-:-:-:1 F2F.F16.F32 s41, s41; +--:-:-:-:1 F2F.F16.F32 s43, s43; +--:-:5:-:1 F2F.F16.F32 s44, s44; + +08:-:-:-:1 STS.U16 [readCs + 4x<(3*6+0)*32>], s30; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+1)*32>], s31; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+2)*32>], s32; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+3)*32>], s33; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+4)*32>], s34; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+5)*32>], s35; + +--:-:-:-:1 F2F.F16.F32 s55, s55; +--:-:-:-:1 F2F.F16.F32 s50, s50; +--:-:-:-:1 F2F.F16.F32 s52, s52; +--:-:-:-:1 F2F.F16.F32 s51, s51; +--:-:-:-:1 F2F.F16.F32 s53, s53; +--:-:6:-:1 F2F.F16.F32 s54, s54; + +10:-:-:-:1 STS.U16 [readCs + 4x<(4*6+0)*32>], s40; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+1)*32>], s41; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+2)*32>], s42; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+3)*32>], s43; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+4)*32>], s44; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+5)*32>], s45; + +20:-:-:-:1 STS.U16 [readCs + 4x<(5*6+0)*32>], s50; +--:-:-:-:1 STS.U16 [readCs + 4x<(5*6+1)*32>], s51; +--:-:-:-:1 STS.U16 [readCs + 4x<(5*6+2)*32>], s52; +--:-:-:-:1 STS.U16 [readCs + 4x<(5*6+3)*32>], s53; +--:-:-:-:1 STS.U16 [readCs + 4x<(5*6+4)*32>], s54; +--:1:-:-:2 STS.U16 [readCs + 4x<(5*6+5)*32>], s55; // FORCE + + +01:-:-:-:1 IADD readCs, readCs, -tid31_4; +--:-:-:-:1 SHL tid31_4, tid31_4, 1; +--:-:-:-:4 IADD readCs, readCs, tid31_4; + + } : q{ +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 0*$N>], s00; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 1*$N>], s01; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 2*$N>], s02; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 3*$N>], s03; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 4*$N>], s04; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 5*$N>], s05; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 0*$N>], s10; +--:-:-:-:1 @P1 
RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 1*$N>], s11; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 2*$N>], s12; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 3*$N>], s13; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 4*$N>], s14; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 5*$N>], s15; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 0*$N>], s20; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 1*$N>], s21; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 2*$N>], s22; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 3*$N>], s23; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 4*$N>], s24; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 5*$N>], s25; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 0*$N>], s30; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 1*$N>], s31; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 2*$N>], s32; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 3*$N>], s33; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 4*$N>], s34; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 5*$N>], s35; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.L.U64 pred30, pred30, 24, pred30; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 0*$N>], s40; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 1*$N>], s41; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 2*$N>], s42; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 3*$N>], s43; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 4*$N>], s44; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 5*$N>], s45; +--:-:-:-:1 R2P PR, pred36, 0x3f; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 0*$N>], s50; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 
4x<5*$Q*$N + 1*$N>], s51; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 2*$N>], s52; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 3*$N>], s53; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 4*$N>], s54; +--:1:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 5*$N>], s55; + }; ++] + + +[+ + our $type; + return $type eq 'h' ? q{ +--:-:-:-:1 LDS.U.32 s00, [readCs + 4x<(0*6+0)*32>]; +--:-:-:-:1 LDS.U.32 s01, [readCs + 4x<(0*6+1)*32>]; +--:-:-:-:1 LDS.U.32 s02, [readCs + 4x<(0*6+2)*32>]; +--:-:-:-:1 LDS.U.32 s03, [readCs + 4x<(0*6+3)*32>]; +--:-:-:-:1 LDS.U.32 s04, [readCs + 4x<(0*6+4)*32>]; +--:-:1:-:1 LDS.U.32 s05, [readCs + 4x<(0*6+5)*32>]; + +--:-:-:-:1 LDS.U.32 s10, [readCs + 4x<(1*6+0)*32>]; +--:-:-:-:1 LDS.U.32 s11, [readCs + 4x<(1*6+1)*32>]; +--:-:-:-:1 LDS.U.32 s12, [readCs + 4x<(1*6+2)*32>]; +--:-:-:-:1 LDS.U.32 s13, [readCs + 4x<(1*6+3)*32>]; +--:-:-:-:1 LDS.U.32 s14, [readCs + 4x<(1*6+4)*32>]; +--:-:2:-:1 LDS.U.32 s15, [readCs + 4x<(1*6+5)*32>]; + +--:-:-:-:1 LDS.U.32 s20, [readCs + 4x<(2*6+0)*32>]; +--:-:-:-:1 LDS.U.32 s21, [readCs + 4x<(2*6+1)*32>]; +--:-:-:-:1 LDS.U.32 s22, [readCs + 4x<(2*6+2)*32>]; +--:-:-:-:1 LDS.U.32 s23, [readCs + 4x<(2*6+3)*32>]; +--:-:-:-:1 LDS.U.32 s24, [readCs + 4x<(2*6+4)*32>]; +--:-:3:-:1 LDS.U.32 s25, [readCs + 4x<(2*6+5)*32>]; + + + +01:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 0*$N>], s00; +--:-:-:-:1 LDS.U.32 s30, [readCs + 4x<(3*6+0)*32>]; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 1*$N>], s01; +--:-:-:-:1 LDS.U.32 s31, [readCs + 4x<(3*6+1)*32>]; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 2*$N>], s02; +--:-:-:-:1 LDS.U.32 s32, [readCs + 4x<(3*6+2)*32>]; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 3*$N>], s03; +--:-:-:-:1 LDS.U.32 s33, [readCs + 4x<(3*6+3)*32>]; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 4*$N>], s04; +--:-:-:-:1 LDS.U.32 s34, [readCs + 4x<(3*6+4)*32>]; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 
2x<0*$Q*$N + 5*$N>], s05; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:4:-:1 LDS.U.32 s35, [readCs + 4x<(3*6+5)*32>]; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 LDS.U.32 s40, [readCs + 4x<(4*6+0)*32>]; +02:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 0*$N>], s10; +--:-:-:-:1 LDS.U.32 s41, [readCs + 4x<(4*6+1)*32>]; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 1*$N>], s11; +--:-:-:-:1 LDS.U.32 s42, [readCs + 4x<(4*6+2)*32>]; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 2*$N>], s12; +--:-:-:-:1 LDS.U.32 s43, [readCs + 4x<(4*6+3)*32>]; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 3*$N>], s13; +--:-:-:-:1 LDS.U.32 s44, [readCs + 4x<(4*6+4)*32>]; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 4*$N>], s14; +--:-:5:-:1 LDS.U.32 s45, [readCs + 4x<(4*6+5)*32>]; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 5*$N>], s15; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 LDS.U.32 s50, [readCs + 4x<(5*6+0)*32>]; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 LDS.U.32 s51, [readCs + 4x<(5*6+1)*32>]; +04:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 0*$N>], s20; +--:-:-:-:1 LDS.U.32 s52, [readCs + 4x<(5*6+2)*32>]; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 1*$N>], s21; +--:-:-:-:1 LDS.U.32 s53, [readCs + 4x<(5*6+3)*32>]; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 2*$N>], s22; +--:-:-:-:1 LDS.U.32 s54, [readCs + 4x<(5*6+4)*32>]; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 3*$N>], s23; +--:-:6:-:1 LDS.U.32 s55, [readCs + 4x<(5*6+5)*32>]; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 4*$N>], s24; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 5*$N>], s25; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; + +08:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 0*$N>], s30; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 1*$N>], s31; +--:-:-:-:1 @P2 
RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 2*$N>], s32; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 3*$N>], s33; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 4*$N>], s34; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 5*$N>], s35; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.L.U64 pred30, pred30, 24, pred30; +10:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 0*$N>], s40; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 1*$N>], s41; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 2*$N>], s42; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 3*$N>], s43; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 4*$N>], s44; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 5*$N>], s45; +--:-:-:-:1 R2P PR, pred36, 0x3f; +20:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 0*$N>], s50; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 1*$N>], s51; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 2*$N>], s52; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 3*$N>], s53; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 4*$N>], s54; +--:1:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 5*$N>], s55; + + + } : ''; ++] + +--:-:-:-:5 RET; + +// RED.E.ADD.F16x2.FTZ.RN \ No newline at end of file diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_3x3_2x2_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_3x3_2x2_32x32.sass new file mode 100644 index 0000000..fe1dc07 --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_winograd_3x3_2x2_32x32.sass @@ -0,0 +1,1814 @@ + +# Copyright 2015 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our ($type, $IX, $D); +our $determ = $D; +our $dtype = $type eq 'h' ? '.U16' : ''; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +our $vec_size = $type eq 'h' ? '64' : '128'; +our $dtype_shift = $type eq 'h' ? '1' : '2'; +our $dtype_size = $type eq 'h' ? '2' : '4'; +sub dtype { return $dtype; } +sub dtype_shift { return $dtype_shift; } +sub output_op { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; } +-] + + + + addr_zero : 4x<(512*4 + 32)*4 + 0> + addr_blk_K : 4x<(512*4 + 32)*4 + 4> + addr_blk_C : 4x<(512*4 + 32)*4 + 5> + addr_blk_P : 4x<(512*4 + 32)*4 + 6> + addr_blk_Q : 4x<(512*4 + 32)*4 + 7> + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_Y : c[0x0][0x15c] + param_X : c[0x0][0x160] + param_P : c[0x0][0x164] + param_Q : c[0x0][0x168] + param_C : c[0x0][0x16c] + param_K : c[0x0][0x170] + param_N : c[0x0][0x174] + param_pad_y : c[0x0][0x178] + param_pad_x : c[0x0][0x17c] + param_GY : c[0x0][0x180] + param_GX : c[0x0][0x184] + param_GYS : c[0x0][0x188] + param_GXS : c[0x0][0x18c] + param_shiftYI : c[0x0][0x190] + param_shiftXI : c[0x0][0x194] + param_superYI : c[0x0][0x198] + param_superXI : c[0x0][0x19c] + param_superNI : c[0x0][0x1a0] + param_shiftY : c[0x0][0x1a4] + param_shiftX : c[0x0][0x1a8] + param_superY : c[0x0][0x1ac] + param_superX : c[0x0][0x1b0] + param_superN : c[0x0][0x1b4] + param_loopXI : c[0x0][0x1b8] 
+ param_loopX : c[0x0][0x1bc] + param_loopN : c[0x0][0x1c0] + param_strideY : c[0x0][0x1c4] + param_strideX : c[0x0][0x1c8] + param_XN : c[0x0][0x1cc] + param_YXN : c[0x0][0x1d0] + param_QN : c[0x0][0x1d4] + param_PQN : c[0x0][0x1d8] + param_SK : c[0x0][0x1dc] + param_RSK : c[0x0][0x1e0] + param_Np : c[0x0][0x1e4] + param_XNp : c[0x0][0x1e8] + param_2XNp : c[0x0][0x1ec] + param_QNp : c[0x0][0x1f0] + param_CPQkc : c[0x0][0x1f4] + param_PQkc : c[0x0][0x1f8] + param_Qkc : c[0x0][0x1fc] + param_kc : c[0x0][0x200] + param_c : c[0x0][0x204] + param_k : c[0x0][0x208] + param_magic_CPQkc : c[0x0][0x20c] + param_shift_CPQkc : c[0x0][0x210] + param_magic_PQkc : c[0x0][0x214] + param_shift_PQkc : c[0x0][0x218] + param_magic_Qkc : c[0x0][0x21c] + param_shift_Qkc : c[0x0][0x220] + param_magic_kc : c[0x0][0x224] + param_shift_kc : c[0x0][0x228] + param_magic_c : c[0x0][0x22c] + param_shift_c : c[0x0][0x230] + param_CRSK : c[0x0][0x234] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 64-79 ~ blk_KCPQkc, blk_CPQkc, blk_PQkc, blk_Qkc, blk_kc, blk_k, blk_c, blk_K, blk_C, blk_P, magic_CPQkc, magic_PQkc, magic_Qkc + 84-95 ~ div1, div2, div3, tidX, tidY, tid16, tid1, neg_CPQkc, neg_PQkc, neg_Qkc, neg_kc, neg_c + + 80-82 : init, tid, blk_Q + 83 = blkC, blkK + 84-95 ~ x, x<1-3>, y, super_x, super_y, tid_X, c, offsign, mask_x, mask_y + 84-95 ~ nloop, N + 81 = off_sign + 64 = swapBuf + + 96-103 : track0<0-1>, track1<0-1>, track2<0-1>, track3<0-1> + + 120-127 ~ writeS, readEs, readIs, 
pred_bits, gys, gxs, n, offset + + 0-31 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3>, t0<0-3>, t1<0-3>, t2<0-3> + 64-72 : f0<0-2>, f1<0-2>, f2<0-2> + 76-79 : blkKCPQ<0-3> + 76-79 : K_blk, C_blk, P_blk, Q_blk + 84-95 ~ CRSK, xmad_determ, PQ_blk + 96-109 ~ alpha, writeCs, readCs, cc, RSK8, tid_1, tid_16, tid_31, tid_32, kk, trackF, K1, SK1 + 110-115 : F00_<0-1>, F01_<0-1>, F02_<0-1>, + 116-121 : F10_<0-1>, F11_<0-1>, F12_<0-1>, + 122-127 : F20_<0-1>, F21_<0-1>, F22_<0-1> +[+ + our $IX; + return $IX ? q{ + 96-99 : trackI<0-1>, offsetI<0-1> + 100-103 ~ swapBuffer, gy, gx + + 104-119 : I0<0-3>, I1<0-3>, I2<0-3>, I3<0-3> + } : q{ + // registers reordered to avoid bank conflicts + 104 = y0x0, Y0X0, I00, Y1X0 + 105 = y0x1, Y0X1, I02, Y1X2 + 106 = y0x2, Y0X2, I13 + 107 = y0x3, Y0X3, I03, Y1X3 + 108 = y1x0, I04 + 110 = y1x1, I05 + 109 = y1x2, I06 + 111 = y1x3, I07 + 113 = y2x0, Y2X0, I08 + 112 = y2x1, Y2X1 + 119 = y2x2, Y2X2, I10 + 117 = y2x3, Y2X3, I11 + 115 = y3x0, Y3X0, I12 + 116 = y3x1, Y3X1, I14 + 114 = y3x2, Y3X2, I09 + 118 = y3x3, Y3X3, I15 + 80 = I01 + 64 = Y1X1 + }; ++] + // Error registers + 104 = p0q0, E00 + 105 = p0q1, E03 + 106 = p1q0, E12 + 107 = p1q1, E15 + 108 = e0, C0, E08 + 109 = E01 + 110 = E02 + 111 = e1, C1, E11 + 112 = E13 + 113 = E14 + 114 = B0, E04 + 115 = B1, E07 + 116 = e2, E06 + 117 = e3, E10 + 118 = E05 + 119 = E09 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blk_KCPQkc, SR_CTAID.X; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 128, PT; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + + +--:-:-:-:1 MOV magic_CPQkc, param_magic_CPQkc; +--:-:-:-:1 MOV magic_PQkc, param_magic_PQkc; +--:-:-:-:1 MOV magic_Qkc, param_magic_Qkc; +--:-:-:-:1 IADD neg_CPQkc, RZ, -param_CPQkc; +--:-:-:-:1 IADD neg_PQkc, RZ, -param_PQkc; +--:-:-:-:1 IADD neg_Qkc, RZ, -param_Qkc; +--:-:-:-:1 IADD neg_kc, RZ, -param_kc; +--:-:-:-:1 IADD neg_c, RZ, -param_c; + +--:-:-:-:1 ISETP.NE.AND P1, PT,
magic_CPQkc, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magic_PQkc, 1, PT; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_Qkc, 1, PT; + +// blk_K = blk_KCPQkc / CPQkc +02:-:-:-:1 @P1 XMAD div1, blk_KCPQkc, magic_CPQkc, RZ; +--:-:-:-:1 @P1 XMAD div2, blk_KCPQkc, magic_CPQkc.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blk_KCPQkc.H1, magic_CPQkc.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blk_KCPQkc.H1, magic_CPQkc, div1; +--:-:-:-:1 @P1 IADD3.RS blk_K, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 blk_K, blk_K, param_shift_CPQkc; +--:-:-:-:1 @!P1 SHR.U32 blk_K, blk_KCPQkc, param_shift_CPQkc; + +// blk_CPQkc = blk_KCPQkc % CPQkc +--:-:-:-:1 XMAD.LO2 blk_CPQkc, neg_CPQkc, blk_K, blk_KCPQkc; + +// blk_C = blk_CPQkc / PQkc +--:-:-:-:1 @P2 XMAD div1, blk_CPQkc, magic_PQkc, RZ; +--:-:-:-:1 @P2 XMAD div2, blk_CPQkc, magic_PQkc.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, blk_CPQkc.H1, magic_PQkc.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, blk_CPQkc.H1, magic_PQkc, div1; +--:-:-:-:1 @P2 IADD3.RS blk_C, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 blk_C, blk_C, param_shift_PQkc; +--:-:-:-:1 @!P2 SHR.U32 blk_C, blk_CPQkc, param_shift_PQkc; + +// blk_PQkc = blk_CPQkc % PQkc +--:-:-:-:1 XMAD.LO2 blk_PQkc, neg_PQkc, blk_C, blk_CPQkc; + +// blk_P = blk_PQkc / Qkc +--:-:-:-:1 @P3 XMAD div1, blk_PQkc, magic_Qkc, RZ; +--:-:-:-:1 @P3 XMAD div2, blk_PQkc, magic_Qkc.H1, RZ; +--:-:-:-:1 @P3 XMAD div3, blk_PQkc.H1, magic_Qkc.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, blk_PQkc.H1, magic_Qkc, div1; +--:-:-:-:1 @P3 IADD3.RS blk_P, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 blk_P, blk_P, param_shift_Qkc; +--:-:-:-:1 @!P3 SHR.U32 blk_P, blk_PQkc, param_shift_Qkc; + +// blk_Qkc = blk_PQkc % Qkc +--:-:-:-:1 XMAD.LO2 blk_Qkc, neg_Qkc, blk_P, blk_PQkc; + +// blk_Q = blk_Qkc / kc +--:-:-:-:1 XMAD.LO2C blk_Q, blk_Qkc, param_magic_kc, RZ; +--:-:-:-:1 SHR.U32 blk_Q, blk_Q, param_shift_kc; +// blk_kc = blk_Qkc % kc +--:-:-:-:1 XMAD.S16.U16 blk_kc, neg_kc, blk_Q, blk_Qkc; + +// blk_k = blk_kc / c +--:-:-:-:1 XMAD blk_k, blk_kc, param_magic_c, RZ; 
+--:-:-:-:1 SHR.U32 blk_k, blk_k, param_shift_c; +// blk_c = blk_kc % c +--:-:-:-:1 XMAD.S16.U16 blk_c, neg_c, blk_k, blk_kc; + +// blk_K = blk_K*param_k + blk_k +--:-:-:-:1 XMAD blk_K, blk_K, param_k, blk_k; +// blk_C = blk_C*param_c + blk_c +--:-:-:-:1 XMAD blk_C, blk_C, param_c, blk_c; + +// Spill these block constants to shared +--:-:-:-:1 ISETP.EQ.AND P5, PT, tid, RZ, PT; +--:-:-:-:1 @P5 STS [addr_blk_K], blk_K; +--:-:-:-:1 @P5 STS [addr_blk_C], blk_C; +--:-:-:-:1 @P5 STS [addr_blk_P], blk_P; +--:-:-:-:1 @P5 STS [addr_blk_Q], blk_Q; + +// gxs = blk_Q +// gys = blk_P +--:-:-:-:1 MOV gxs, blk_Q; +--:-:-:-:1 MOV gys, blk_P; + +[+ + our $IX; + return $IX ? '' : q{ +--:-:-:-:1 BFE.U32 n, tid, param_superN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +// tidX = (tid & 127) >> 2 +// tidY = tid & 3 +// writeS = tidY*512 + tidX + (tidY << 3) +--:-:-:-:1 BFE.U32 tidX, tid, 0x502; // 5 bits at position 2 +--:-:-:-:1 LOP.AND tidY, tid, 3; +--:-:-:-:1 ISCADD writeS, tidY, tidX, 9; +--:-:-:-:1 ISCADD writeS, tidY, writeS, 3; +--:-:-:-:1 SHL writeS, writeS, 2; + }; ++] + +// readEs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readIs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readIs, tid, 8; +--:-:-:-:1 SHR.U32 readIs, readIs, 2; +--:-:-:-:1 LOP3.LUT readIs, readIs, tid16, tid1, 0xfe; +--:-:-:-:1 SHL readIs, readIs, 4; + +--:-:-:-:1 BFE.U32 readEs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readEs, readEs, tid16; +--:-:-:-:1 ISCADD readEs, readEs, 4x<512*4 + 32>, 4; + + +--:-:-:-:5 @P0 BRA.U ERROR_SETUP; + +[+ + our ($IX, $dtype_shift); + return $IX ? 
qq{ + +--:-:-:-:1 MOV swapBuffer, 4x<(512*4 + 32)*2>; + +// tidY = (tid & 127) / 32 +--:-:-:-:1 BFE.U32 tidY, tid, 0x205; // 2 bits at position 5 +--:-:-:-:1 BFE.U32 n, tid, param_superNI; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +// writeS = (tidY*512 + (tid & 31)*4)*4 +--:-:-:-:1 LOP.AND tidX, tid, 31; +--:-:-:-:1 SHL writeS, tidX, 4; +--:-:-:-:1 ISCADD writeS, tidY, writeS, 11; +// offsetI = I + (tid & 31)*4 +--:-:-:-:1 LEA offsetI0.CC, tidX, param_I[0], 1x<$dtype_shift + 2>; +--:-:-:-:1 LEA.HI.X offsetI1, tidX, param_I[1], RZ, 1x<$dtype_shift + 2>; + + + } : ''; ++] + +--:-:-:-:0 MOV blkC, blk_C; + +// IMAGE_SETUP +--:-:-:-:5 CAL IMAGE_OFFSET; +--:-:-:-:5 CAL IMAGE_LOAD; +--:-:-:-:5 CAL IMAGE_OFFSET; + +[+ + our ($convert_in, $IX); + if ($convert_in) + { + my $out = $IX ? qq{ +02:-:-:-:1 $convert_in I03, I01.H1; +--:-:-:-:1 $convert_in I02, I01.H0; +--:-:-:-:1 $convert_in I01, I00.H1; +--:-:2:-:1 $convert_in I00, I00.H0; + +04:-:-:-:1 $convert_in I13, I11.H1; +--:-:-:-:1 $convert_in I12, I11.H0; +--:-:-:-:1 $convert_in I11, I10.H1; +--:-:3:-:1 $convert_in I10, I10.H0; + +08:-:-:-:1 $convert_in I23, I21.H1; +--:-:-:-:1 $convert_in I22, I21.H0; +--:-:-:-:1 $convert_in I21, I20.H1; +--:-:4:-:1 $convert_in I20, I20.H0; + +10:-:-:-:1 $convert_in I33, I31.H1; +--:-:-:-:1 $convert_in I32, I31.H0; +--:-:-:-:1 $convert_in I31, I30.H1; +--:-:5:-:1 $convert_in I30, I30.H0; + } : qq{ +02:-:-:-:1 $convert_in y0x0, y0x0; +--:-:-:-:1 $convert_in y0x1, y0x1; +--:-:-:-:1 $convert_in y0x2, y0x2; +--:-:2:-:1 $convert_in y0x3, y0x3; + +04:-:-:-:1 $convert_in y2x0, y2x0; +--:-:-:-:1 $convert_in y2x1, y2x1; +--:-:-:-:1 $convert_in y2x2, y2x2; +--:-:3:-:1 $convert_in y2x3, y2x3; + +08:-:-:-:1 $convert_in y1x0, y1x0; +--:-:-:-:1 $convert_in y1x1, y1x1; +--:-:-:-:1 $convert_in y1x2, y1x2; +--:-:4:-:1 $convert_in y1x3, y1x3; + +10:-:-:-:1 $convert_in y3x0, y3x0; +--:-:-:-:1 $convert_in y3x1, y3x1; +--:-:-:-:1 $convert_in y3x2, y3x2; +--:-:5:-:1 $convert_in y3x3, y3x3; + }; + 
return qq{ + + +$out + +--:-:-:-:1 NOP; # we need 20 total conversions. that's 4 short of 2 instruction cache lines +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; + + }; + } + return ''; ++] + +[+ + our $IX; + return $IX ? q{ +02:-:-:-:1 STS.128 [writeS + 4x<00*4>], I0; +04:-:-:-:1 STS.128 [writeS + 4x<32*4>], I1; +08:-:-:-:1 STS.128 [writeS + 4x<64*4>], I2; +10:-:-:-:1 STS.128 [writeS + 4x<96*4>], I3; + +// init = bNextY ? 1 : 0 +--:-:-:-:0 SEL pred_bits, RZ, 1, !P6; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuffer; +--:-:-:-:0 IADD swapBuffer, RZ, -swapBuffer; + +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>]; + +--:-:-:-:5 CAL IMAGE_LOAD; + +// init += bNextY ? 1 : 0 +--:-:-:-:0 @P6 IADD pred_bits, pred_bits, 1; + +--:-:-:-:5 CAL IMAGE_OFFSET; +--:-:-:-:5 BRA.U IMAGE_LOOP; + } : q{ + + +06:-:-:-:1 FADD Y0X0, y0x0, -y2x0; +--:-:-:-:1 FADD Y0X1, y0x1, -y2x1; +--:-:-:-:1 FADD Y0X2, y0x2, -y2x2; +--:-:-:-:1 FADD Y0X3, y0x3, -y2x3; +--:-:-:-:1 FADD I00, Y0X0, -Y0X2; +--:-:-:-:1 FADD I03, -Y0X1, Y0X3; +--:-:-:-:1 FADD I01, Y0X1, Y0X2; +--:-:-:-:1 FADD I02, Y0X2, -Y0X1; +--:-:-:-:1 STS [writeS + 4x<32*00>], I00; +--:-:-:-:1 STS [writeS + 4x<32*03>], I03; +--:-:-:-:1 STS [writeS + 4x<32*01>], I01; +--:6:-:-:1 STS [writeS + 4x<32*02>], I02; +18:-:-:-:1 FADD Y3X0, -y1x0, y3x0; +--:-:-:-:1 FADD Y3X1, -y1x1, y3x1; +--:-:-:-:1 FADD Y3X2, -y1x2, y3x2; +--:-:-:-:1 FADD Y3X3, -y1x3, y3x3; +--:-:-:-:1 FADD I12, Y3X0, -Y3X2; +--:-:-:-:1 FADD I15, -Y3X1, Y3X3; +--:-:-:-:1 FADD I13, Y3X1, Y3X2; +--:-:-:-:1 FADD I14, Y3X2, -Y3X1; +--:-:-:-:1 STS [writeS + 4x<32*12>], I12; +--:-:-:-:1 STS [writeS + 4x<32*15>], I15; +--:-:-:-:1 STS [writeS + 4x<32*13>], I13; +--:-:-:-:1 STS [writeS + 4x<32*14>], I14; +20:-:-:-:1 FADD Y1X0, y1x0, y2x0; +--:-:-:-:1 FADD Y1X1, y1x1, y2x1; +--:-:-:-:1 FADD
Y1X2, y1x2, y2x2; +--:-:-:-:1 FADD Y1X3, y1x3, y2x3; +--:-:-:-:1 FADD Y2X0, y2x0, -y1x0; +--:-:-:-:1 FADD Y2X1, y2x1, -y1x1; +--:-:-:-:1 FADD Y2X2, y2x2, -y1x2; +--:-:-:-:1 FADD Y2X3, y2x3, -y1x3; +--:-:-:-:1 FADD I04, Y1X0, -Y1X2; +--:-:-:-:1 FADD I05, Y1X1, Y1X2; +--:-:-:-:1 FADD I06, Y1X2, -Y1X1; +--:-:-:-:1 FADD I07, -Y1X1, Y1X3; +--:-:-:-:1 STS [writeS + 4x<32*04>], I04; +--:-:-:-:1 STS [writeS + 4x<32*05>], I05; +--:-:-:-:1 STS [writeS + 4x<32*06>], I06; +--:-:-:-:1 STS [writeS + 4x<32*07>], I07; +--:-:-:-:1 FADD I08, Y2X0, -Y2X2; +--:-:-:-:1 FADD I11, -Y2X1, Y2X3; +--:-:-:-:1 FADD I09, Y2X1, Y2X2; +--:-:-:-:1 FADD I10, Y2X2, -Y2X1; +--:-:-:-:1 STS [writeS + 4x<32*08>], I08; +--:-:-:-:1 STS [writeS + 4x<32*11>], I11; +--:-:-:-:1 STS [writeS + 4x<32*09>], I09; +--:-:-:-:1 STS [writeS + 4x<32*10>], I10; + + + +// init = bNextY ? 1 : 0 +--:-:-:-:0 SEL init, RZ, 1, !P6; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 IADD writeS, writeS, 4x<(512*4 + 32)*2>; + +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>]; + +--:-:-:-:5 CAL IMAGE_LOAD; + +// init += bNextY ? 1 : 0 +--:-:-:-:0 @P6 IADD init, init, 1; +--:-:-:-:5 CAL IMAGE_OFFSET; +--:-:-:-:0 BFI pred_bits, init, 0x214, pred_bits; // 2 bits at position 20 +--:-:-:-:5 BRA.U IMAGE_LOOP; + }; ++] + + +IMAGE_OFFSET: + + +[+ + our ($dtype_shift, $IX); + return $IX ? 
qq{ + +--:-:-:-:1 BFE.U32 super_x, tid, param_superXI; +--:-:-:-:1 BFE.U32 super_y, tid, param_superYI; +--:-:-:-:1 SHL gx, gxs, param_shiftXI; +--:-:-:-:1 SHL gy, gys, param_shiftYI; +--:-:-:-:1 IADD gx, gx, super_x; +--:-:-:-:1 IADD gy, gy, super_y; + +--:-:-:-:1 ISETP.LT.AND P0, PT, gx, param_GX, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, gy, param_GY, P0; + +// offset = blkC*GY*GX*N + gy*GX*N + gx*N + n +--:-:-:-:1 XMAD.U16.U16 offset, gx, param_N, n; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, gy, param_XN, offset; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, blkC, param_YXN, offset; + +// trackI = offsetI + offset*512 +20:-:-:-:1 LEA trackI0.CC, offset, offsetI0, 1x<$dtype_shift + 9>; +--:-:-:-:0 LEA.HI.X trackI1, offset, offsetI1, RZ, 1x<$dtype_shift + 9>; + } : qq{ +// Calc superblock coordinates +01:-:-:-:1 SHL x, gxs, param_shiftX; +--:-:-:-:1 SHL y, gys, param_shiftY; + +// Calc this thread's sub-block coordinates +--:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 super_y, tid, param_superY; +--:-:-:-:1 ISCADD x, super_x, x, 1; +--:-:-:-:1 ISCADD y, super_y, y, 1; + +// Apply padding +--:-:-:-:1 IADD x, x, -param_pad_x; +--:-:-:-:1 IADD y, y, -param_pad_y; + +// c = blkC*32 + tidX +--:-:-:-:1 BFE.U32 tid_X, tid, 0x502; // 5 bits at position 2 +--:-:-:-:1 ISCADD c, blkC, tid_X, 5; +--:-:-:-:1 ISETP.LT.AND P4, PT, c, param_C, P4; + +// offset = c*YXN + y*XN + x*N + n +--:-:-:-:1 XMAD.S16.U16 offset, x, param_N, n; +--:-:-:-:1 XMAD.S16.U16.LO2C offset, y, param_XN, offset; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, c, param_YXN, offset; +--:-:-:-:1 ISET.LT.AND offsign, offset, RZ, PT; + +20:-:-:-:1 LEA track00.CC, offset, param_I[0], $dtype_shift; +--:-:-:-:1 IADD.X track01, offsign, param_I[1]; +--:-:-:-:1 IADD track10.CC, track00, param_Np; +--:-:-:-:1 IADD.X track11, track01, RZ; +--:-:-:-:1 IADD track20.CC, track10, param_Np; +--:-:-:-:1 IADD.X track21, track11, RZ; +--:-:-:-:1 IADD track30.CC, track20, param_Np; +--:-:-:-:1 IADD.X track31, track21, RZ; + 
+--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 IADD x2, x, 2; +--:-:-:-:1 IADD x3, x, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_X, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_X, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_X, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_X, P4; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x0f; + +--:-:-:-:1 IADD x1, y, 1; +--:-:-:-:1 IADD x2, y, 2; +--:-:-:-:1 IADD x3, y, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, y, param_Y, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_Y, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_Y, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_Y, P4; +--:-:-:-:1 ISETP.GE.AND P0, PT, y, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; + +--:-:-:-:1 SEL pred_bits, mask_x, RZ, P0; +--:-:-:-:1 \@P1 BFI pred_bits, mask_x, 0x404, pred_bits; +--:-:-:-:1 \@P2 BFI pred_bits, mask_x, 0x408, pred_bits; +--:-:-:-:1 \@P3 BFI pred_bits, mask_x, 0x40c, pred_bits; + +// Cache y preds in high bits +--:-:-:-:1 P2R mask_y, PR, RZ, 0x0f; +--:-:-:-:0 BFI pred_bits, mask_y, 0x410, pred_bits; // 4 bits at position 16 + }; ++] + + +--:-:-:-:5 RET; + +IMAGE_LOAD: + + + +[+ + our ($dtype, $dtype_shift, $IX, $vec_size, $dtype_size); + return $IX ? 
qq{ + +--:-:2:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero]; +--:-:3:-:1 \@!P0 LDS.U.$vec_size I1, [addr_zero]; +--:-:4:-:1 \@!P0 LDS.U.$vec_size I2, [addr_zero]; +--:-:5:-:1 \@!P0 LDS.U.$vec_size I3, [addr_zero]; + +--:-:2:-:1 \@P0 LDG.E.CG.$vec_size I0, [trackI + 4x<00 * $dtype_size>]; +--:-:3:-:1 \@P0 LDG.E.CG.$vec_size I1, [trackI + 4x<32 * $dtype_size>]; +--:-:4:-:1 \@P0 LDG.E.CG.$vec_size I2, [trackI + 4x<64 * $dtype_size>]; +--:6:5:-:1 \@P0 LDG.E.CG.$vec_size I3, [trackI + 4x<96 * $dtype_size>]; + + } : qq{ +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 SHF.R.U64 pred_bits, pred_bits, 8, pred_bits; + +--:-:-:-:1 \@!P0 MOV y0x0, RZ; +--:-:-:-:1 \@P0 LDG.E.CI$dtype y0x0, [track0]; +--:-:-:-:1 \@!P1 MOV y0x1, RZ; +--:-:-:-:1 \@P1 LDG.E.CI$dtype y0x1, [track1]; +--:-:-:-:1 \@!P2 MOV y0x2, RZ; +--:-:-:-:1 \@P2 LDG.E.CI$dtype y0x2, [track2]; +--:-:-:-:1 \@!P3 MOV y0x3, RZ; +--:6:2:-:1 \@P3 LDG.E.CI$dtype y0x3, [track3]; +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 SHF.L.U64 pred_bits, pred_bits, 4, pred_bits; +20:-:-:-:1 IADD track00.CC, track00, param_2XNp; +--:-:-:-:1 IADD.X track01, track01, RZ; +--:-:-:-:1 IADD track10.CC, track10, param_2XNp; +--:-:-:-:1 IADD.X track11, track11, RZ; +--:-:-:-:1 IADD track20.CC, track20, param_2XNp; +--:-:-:-:1 IADD.X track21, track21, RZ; +--:-:-:-:1 IADD track30.CC, track30, param_2XNp; +--:-:-:-:1 IADD.X track31, track31, RZ; + +--:-:-:-:1 \@!P0 MOV y2x0, RZ; +--:-:-:-:1 \@P0 LDG.E.CI$dtype y2x0, [track0]; +--:-:-:-:1 \@!P1 MOV y2x1, RZ; +--:-:-:-:1 \@P1 LDG.E.CI$dtype y2x1, [track1]; +--:-:-:-:1 \@!P2 MOV y2x2, RZ; +--:-:-:-:1 \@P2 LDG.E.CI$dtype y2x2, [track2]; +--:-:-:-:1 \@!P3 MOV y2x3, RZ; +--:6:3:-:1 \@P3 LDG.E.CI$dtype y2x3, [track3]; +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 SHF.R.U64 pred_bits, pred_bits, 8, pred_bits; +20:-:-:-:1 IADD track00.CC, track00, -param_XNp; +--:-:-:-:1 IADD.X track01, track01, -RZ; +--:-:-:-:1 IADD track10.CC, track10, -param_XNp; +--:-:-:-:1 IADD.X track11, track11, -RZ; 
+--:-:-:-:1 IADD track20.CC, track20, -param_XNp; +--:-:-:-:1 IADD.X track21, track21, -RZ; +--:-:-:-:1 IADD track30.CC, track30, -param_XNp; +--:-:-:-:1 IADD.X track31, track31, -RZ; + +--:-:-:-:1 \@!P0 MOV y1x0, RZ; +--:-:-:-:1 \@P0 LDG.E.CI$dtype y1x0, [track0]; +--:-:-:-:1 \@!P1 MOV y1x1, RZ; +--:-:-:-:1 \@P1 LDG.E.CI$dtype y1x1, [track1]; +--:-:-:-:1 \@!P2 MOV y1x2, RZ; +--:-:-:-:1 \@P2 LDG.E.CI$dtype y1x2, [track2]; +--:-:-:-:1 \@!P3 MOV y1x3, RZ; +--:6:4:-:1 \@P3 LDG.E.CI$dtype y1x3, [track3]; +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 SHF.L.U64 pred_bits, pred_bits, 12, pred_bits; +20:-:-:-:1 IADD track00.CC, track00, param_2XNp; +--:-:-:-:1 IADD.X track01, track01, RZ; +--:-:-:-:1 IADD track10.CC, track10, param_2XNp; +--:-:-:-:1 IADD.X track11, track11, RZ; +--:-:-:-:1 IADD track20.CC, track20, param_2XNp; +--:-:-:-:1 IADD.X track21, track21, RZ; +--:-:-:-:1 IADD track30.CC, track30, param_2XNp; +--:-:-:-:1 IADD.X track31, track31, RZ; + +--:-:-:-:1 \@!P0 MOV y3x0, RZ; +--:-:-:-:1 \@P0 LDG.E.CI$dtype y3x0, [track0]; +--:-:-:-:1 \@!P1 MOV y3x1, RZ; +--:-:-:-:1 \@P1 LDG.E.CI$dtype y3x1, [track1]; +--:-:-:-:1 \@!P2 MOV y3x2, RZ; +--:-:-:-:1 \@P2 LDG.E.CI$dtype y3x2, [track2]; +--:-:-:-:1 \@!P3 MOV y3x3, RZ; +--:6:5:-:1 \@P3 LDG.E.CI$dtype y3x3, [track3]; + }; ++] + + +// Advance offset/preds +--:-:-:-:1 IADD n, n, param_loopN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +--:-:-:-:1 @!P4 BFE.U32 n, tid, param_superNI; +--:-:-:-:1 @!P4 IADD gxs, gxs, param_strideX; + +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, PT; + +--:-:-:-:1 @!P5 MOV gxs, blk_Q; +--:-:-:-:1 @!P5 IADD gys, gys, param_strideY; + +--:-:-:-:1 ISETP.LT.AND P6, PT, gys, param_GYS, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, P6; +--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6; + + +--:-:-:-:5 RET; + +ERROR_SETUP: + +[+ + our $IX; + return $IX ? 
q{ + +--:-:-:-:1 BFE.U32 n, tid, param_superN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +// tidX = (tid & 127) >> 2 +// tidY = tid & 3 +// writeS = tidY*512 + tidX + (tidY << 3) +--:-:-:-:1 BFE.U32 tidX, tid, 0x502; // 5 bits at position 2 +--:-:-:-:1 LOP.AND tidY, tid, 3; +--:-:-:-:1 ISCADD writeS, tidY, tidX, 9; +--:-:-:-:1 ISCADD writeS, tidY, writeS, 3; +--:-:-:-:1 SHL writeS, writeS, 2; + + } : ''; ++] + +--:-:-:-:0 MOV blkK, blk_K; + +--:-:-:-:5 CAL ERROR_OFFSET; +--:-:-:-:5 CAL ERROR_LOAD; +--:-:-:-:5 CAL ERROR_OFFSET; + + +[+ + our ($convert_in); + return $convert_in ? qq{ + +02:-:2:-:1 $convert_in p0q0, p0q0; +04:-:3:-:1 $convert_in p0q1, p0q1; +08:-:4:-:1 $convert_in p1q1, p1q1; +10:-:5:-:1 $convert_in p1q0, p1q0; + + } : ''; ++] + + +02:-:-:-:1 FMUL e0, p0q0, 0.5; +04:-:-:-:1 FFMA E01, p0q1, 0.5, e0; +--:-:-:-:1 FFMA E02, p0q1, -0.5, e0; +08:-:-:-:1 FMUL e1, p1q1, 0.5; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*00 + 32>], E00; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*01 + 32>], E01; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*02 + 32>], E02; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*03 + 32>], E03; +10:-:-:-:1 FFMA E13, p1q0, 0.5, e1; +--:-:-:-:1 FFMA E14, p1q0, 0.5, -e1; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*12 + 32>], E12; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*15 + 32>], E15; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*13 + 32>], E13; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*14 + 32>], E14; +--:-:-:-:1 FFMA B0, p1q0, 0.5, e0; +--:-:-:-:1 FFMA C0, p1q0, -0.5, e0; +--:-:-:-:1 FFMA B1, p0q1, 0.5, e1; +--:-:-:-:1 FFMA C1, p0q1, 0.5, -e1; +--:-:-:-:1 FMUL e2, B0, 0.5; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*04 + 32>], E04; +--:-:-:-:1 FMUL e3, C0, 0.5; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*08 + 32>], E08; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*07 + 32>], E07; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*11 + 32>], E11; +--:-:-:-:1 FFMA E05, B1, 0.5, e2; +--:-:-:-:1 FFMA E06, B1, -0.5, e2; +--:-:-:-:1 FFMA E09, C1, 0.5, e3; +--:-:-:-:1 FFMA E10, C1, -0.5, e3; 
+--:-:-:-:1 STS [writeS + 4x<512*4 + 32*05 + 32>], E05; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*06 + 32>], E06; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*09 + 32>], E09; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*10 + 32>], E10; + + + + +// init = bNextY ? 1 : 0 +--:-:-:-:0 SEL init, RZ, 1, !P6; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 IADD writeS, writeS, 4x<(512*4 + 32)*2>; + +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>]; + +--:-:-:-:5 CAL ERROR_LOAD; + +// init += bNextY ? 1 : 0 +--:-:-:-:0 @P6 IADD init, init, 1; +--:-:-:-:5 CAL ERROR_OFFSET; +--:-:-:-:0 BFI pred_bits, init, 0x208, pred_bits; // 2 bits at position 8 +--:-:-:-:5 BRA.U ERROR_LOOP; + +ERROR_OFFSET: + + +// Calc superblock coordinates +01:-:-:-:1 SHL x, gxs, param_shiftX; +--:-:-:-:1 SHL y, gys, param_shiftY; + +// Calc this thread's sub-block coordinates +--:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 super_y, tid, param_superY; +--:-:-:-:1 ISCADD x, super_x, x, 1; +--:-:-:-:1 ISCADD y, super_y, y, 1; + +// k = blkK*32 + tidX (have k share register with c) +--:-:-:-:1 BFE.U32 tid_X, tid, 0x502; // 5 bits at position 2 +--:-:-:-:1 ISCADD c, blkK, tid_X, 5; +--:-:-:-:1 ISETP.LT.AND P4, PT, c, param_K, P4; + +// offset0 = k*PQN + y*QN + x*N + n +// offset1 = offset0 + N +// offset2 = offset0 + QN +// offset3 = offset1 + QN +--:-:-:-:1 XMAD.S16.U16 offset, x, param_N, n; +--:-:-:-:1 XMAD.S16.U16.LO2C offset, y, param_QN, offset; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, c, param_PQN, offset; + +20:-:-:-:1 LEA track00.CC, offset, param_E[0], [+ dtype_shift() +]; +--:-:-:-:1 IADD.X track01, RZ, param_E[1]; +--:-:-:-:1 IADD track10.CC, track00, param_Np; +--:-:-:-:1 IADD.X track11, track01, RZ; +--:-:-:-:1 IADD track20.CC, track00, param_QNp; +--:-:-:-:1 IADD.X track21, track01, RZ; +--:-:-:-:1 IADD track30.CC, track10, 
param_QNp; +--:-:-:-:0 IADD.X track31, track11, RZ; + +--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 IADD x2, y, 1; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, y, param_P, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, x2, param_P, P4; + +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, y, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x2, RZ, P3; + +--:-:-:-:1 P2R mask_x, PR, RZ, 0x03; +--:-:-:-:1 P2R mask_y, PR, RZ, 0x0c; + +--:-:-:-:1 SEL pred_bits, mask_x, RZ, P2; +--:-:-:-:1 @P3 BFI pred_bits, mask_x, 0x202, pred_bits; // 2 bits at position 2 + +// Cache y preds in high bits +--:-:-:-:0 BFI pred_bits, mask_y, 0x404, pred_bits; // 4 bits at position 4 + + +--:-:-:-:5 RET; + +ERROR_LOAD: + + + +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 @!P0 MOV p0q0, RZ; +--:-:2:-:1 @P0 LDG.E.CI[+ dtype() +] p0q0, [track0]; +--:-:-:-:1 @!P1 MOV p0q1, RZ; +--:-:3:-:1 @P1 LDG.E.CI[+ dtype() +] p0q1, [track1]; +--:-:-:-:1 @!P3 MOV p1q1, RZ; +--:-:4:-:1 @P3 LDG.E.CI[+ dtype() +] p1q1, [track3]; +--:-:-:-:1 @!P2 MOV p1q0, RZ; +--:6:5:-:1 @P2 LDG.E.CI[+ dtype() +] p1q0, [track2]; + + + +// Advance offset/preds +--:-:-:-:1 IADD n, n, param_loopN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +--:-:-:-:1 @!P4 BFE.U32 n, tid, param_superN; +--:-:-:-:1 @!P4 IADD gxs, gxs, param_strideX; + +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, PT; + +--:-:-:-:1 @!P5 MOV gxs, blk_Q; +--:-:-:-:1 @!P5 IADD gys, gys, param_strideY; + +--:-:-:-:1 ISETP.LT.AND P6, PT, gys, param_GYS, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, P6; +--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6; + + +--:-:-:-:5 RET; + + +IMAGE_LOOP: + +[+ + our ($dtype, $dtype_shift, $dtype_size, $vec_size, $convert_in, $IX); + my %insert = ( + + $IX ? 
( + + j0c8 => "--:-:-:-:1 ISETP.LT.AND P0, PT, gx, param_GX, P6;\n", + j0c20 => "--:-:-:-:1 ISETP.LT.AND P0, PT, gy, param_GY, P0;\n", + + j1c10 => "20:-:-:-:1 \@P0 LEA trackI0.CC, offset, offsetI0, 1x<$dtype_shift + 9>;\n", + j1c15 => "--:-:-:-:1 \@P0 LEA.HI.X trackI1, offset, offsetI1, RZ, 1x<$dtype_shift + 9>;\n", + + j1c32 => "02:2:-:-:1 STS.128 [writeS + 4x<00*4>], I0;\n", + j1c36 => "02:-:2:-:1 \@P0 LDG.E.CG.$vec_size I0, [trackI + 4x<00 * $dtype_size>];\n", + j1c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];\n", + + j1c56 => "04:3:-:-:1 STS.128 [writeS + 4x<32*4>], I1;\n", + j1c60 => "04:-:3:-:1 \@P0 LDG.E.CG.$vec_size I1, [trackI + 4x<32 * $dtype_size>];\n", + j1c62 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I1, [addr_zero];\n", + + + j2c32 => "08:4:-:-:1 STS.128 [writeS + 4x<64*4>], I2;\n", + j2c36 => "08:-:4:-:1 \@P0 LDG.E.CG.$vec_size I2, [trackI + 4x<64 * $dtype_size>];\n", + j2c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I2, [addr_zero];\n", + + + j2c56 => "10:5:-:-:1 STS.128 [writeS + 4x<96*4>], I3;\n", + j2c60 => "10:6:5:-:1 \@P0 LDG.E.CG.$vec_size I3, [trackI + 4x<96 * $dtype_size>];\n", + j2c62 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I3, [addr_zero];\n", + + $convert_in ? 
( + j1c16 => "02:-:-:-:1 $convert_in I03, I01.H1;\n", + j1c20 => "--:-:-:-:1 $convert_in I02, I01.H0;\n", + j1c24 => "--:-:-:-:1 $convert_in I01, I00.H1;\n", + j1c28 => "--:-:2:-:1 $convert_in I00, I00.H0;\n", + + j1c40 => "04:-:-:-:1 $convert_in I13, I11.H1;\n", + j1c44 => "--:-:-:-:1 $convert_in I12, I11.H0;\n", + j1c48 => "--:-:-:-:1 $convert_in I11, I10.H1;\n", + j1c52 => "--:-:3:-:1 $convert_in I10, I10.H0;\n", + + j2c16 => "08:-:-:-:1 $convert_in I23, I21.H1;\n", + j2c20 => "--:-:-:-:1 $convert_in I22, I21.H0;\n", + j2c24 => "--:-:-:-:1 $convert_in I21, I20.H1;\n", + j2c28 => "--:-:4:-:1 $convert_in I20, I20.H0;\n", + + j2c40 => "10:-:-:-:1 $convert_in I33, I31.H1;\n", + j2c44 => "--:-:-:-:1 $convert_in I32, I31.H0;\n", + j2c48 => "--:-:-:-:1 $convert_in I31, I30.H1;\n", + j2c52 => "--:-:5:-:1 $convert_in I30, I30.H0;\n", + ) : (), + + j2c63 => "--:-:-:-:1 IADD n, n, param_loopN;\n" . + "--:-:-:-:0 IADD offset, offset, param_loopN;\n". + "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuffer;\n" . + "--:-:-:-:1 IADD readEs, readEs, -swapBuffer;\n" . + "--:-:-:-:1 IADD writeS, writeS, swapBuffer;\n" . + "--:-:-:-:1 IADD swapBuffer, RZ, -swapBuffer;\n", + + j3c8 => "--:-:-:-:1 PSETP.OR.AND P4, PT, P5, P6, PT;\n", + j3c21 => "--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, P4;\n", + + j3c34 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n", + + j3c63 => "--:-:-:Y:5 \@P4 BRA.U IMAGE_LOOP;\n", + + ) : ( + + $convert_in ? 
( + j0c37 => "02:-:-:-:1 $convert_in y0x0, y0x0;\n", + j0c41 => "--:-:-:-:1 $convert_in y0x1, y0x1;\n", + j0c45 => "--:-:-:-:1 $convert_in y0x2, y0x2;\n", + j0c49 => "--:-:2:-:1 $convert_in y0x3, y0x3;\n", + + j0c53 => "04:-:-:-:1 $convert_in y2x0, y2x0;\n", + j0c57 => "--:-:-:-:1 $convert_in y2x1, y2x1;\n", + j0c61 => "--:-:-:-:1 $convert_in y2x2, y2x2;\n", + j1c1 => "--:-:3:-:1 $convert_in y2x3, y2x3;\n", + + j1c5 => "08:-:-:-:1 $convert_in y1x0, y1x0;\n", + j1c10 => "--:-:-:-:1 $convert_in y1x1, y1x1;\n", + j1c14 => "--:-:-:-:1 $convert_in y1x2, y1x2;\n", + j1c16 => "--:-:4:-:1 $convert_in y1x3, y1x3;\n", + + j1c21 => "10:-:-:-:1 $convert_in y3x0, y3x0;\n", + j1c23 => "--:-:-:-:1 $convert_in y3x1, y3x1;\n", + j1c27 => "--:-:-:-:1 $convert_in y3x2, y3x2;\n", + j1c29 => "--:-:5:-:1 $convert_in y3x3, y3x3;\n", + ) : (), + + j1c22 => "06:-:-:-:1 FADD Y0X0, y0x0, -y2x0;\n" . + "--:-:-:-:1 FADD Y0X1, y0x1, -y2x1;\n", + + j1c24 => "--:-:-:-:1 FADD Y0X2, y0x2, -y2x2;\n" . + "--:-:-:-:1 FADD Y0X3, y0x3, -y2x3;\n", + + j1c28 => "--:-:-:-:1 FADD I00, Y0X0, -Y0X2;\n" . + "--:-:-:-:1 FADD I03, -Y0X1, Y0X3;\n", + j1c30 => "--:-:-:-:1 FADD I01, Y0X1, Y0X2;\n" . + "--:-:-:-:1 FADD I02, Y0X2, -Y0X1;\n", + + j1c31 => "--:-:-:-:1 STS [writeS + 4x<32*00>], I00;\n", + j1c33 => "--:-:-:-:1 STS [writeS + 4x<32*03>], I03;\n", + j1c35 => "--:-:-:-:1 STS [writeS + 4x<32*01>], I01;\n", + j1c37 => "--:2:-:-:1 STS [writeS + 4x<32*02>], I02;\n", + + j1c39 => "18:-:-:-:1 FADD Y3X0, -y1x0, y3x0;\n" . + "--:-:-:-:1 FADD Y3X1, -y1x1, y3x1;\n" . + "--:-:-:-:1 FADD Y3X2, -y1x2, y3x2;\n" . + "--:-:-:-:1 FADD Y3X3, -y1x3, y3x3;\n", + + j1c43 => "--:-:-:-:1 FADD I12, Y3X0, -Y3X2;\n" . + "--:-:-:-:1 FADD I15, -Y3X1, Y3X3;\n" . + "--:-:-:-:1 FADD I13, Y3X1, Y3X2;\n" . 
+ "--:-:-:-:1 FADD I14, Y3X2, -Y3X1;\n", + + j1c44 => "--:-:-:-:1 STS [writeS + 4x<32*12>], I12;\n", + j1c46 => "--:-:-:-:1 STS [writeS + 4x<32*15>], I15;\n", + j1c48 => "--:-:-:-:1 STS [writeS + 4x<32*13>], I13;\n", + j1c50 => "--:-:-:-:1 STS [writeS + 4x<32*14>], I14;\n", + + j1c52 => "--:-:-:-:1 R2P PR, pred_bits, 0x0f;\n" . + "--:-:-:-:1 SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;\n", + + j1c53 => "--:-:-:-:1 \@P6 ISET.LT.AND off_sign, offset, RZ, PT;\n" . + "--:-:-:-:1 \@P6 LEA track00.CC, offset, param_I[0], $dtype_shift;\n", + + j1c58 => "--:-:-:-:1 \@P6 IADD.X track01, off_sign, param_I[1];\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track00, param_Np;\n", + + j2c18 => "--:-:-:-:1 FADD Y1X0, y1x0, y2x0;\n" . + "--:-:-:-:1 FADD Y1X1, y1x1, y2x1;\n" . + "--:-:-:-:1 FADD Y1X2, y1x2, y2x2;\n" . + "--:-:-:-:1 FADD Y1X3, y1x3, y2x3;\n" . + "--:-:-:-:1 FADD Y2X0, y2x0, -y1x0;\n" . + "--:-:-:-:1 FADD Y2X1, y2x1, -y1x1;\n" . + "--:-:-:-:1 FADD Y2X2, y2x2, -y1x2;\n" . + "--:-:-:-:1 FADD Y2X3, y2x3, -y1x3;\n" . + "--:-:-:-:1 FADD I04, Y1X0, -Y1X2;\n" . + "--:-:-:-:1 FADD I05, Y1X1, Y1X2;\n" . + "--:-:-:-:1 FADD I06, Y1X2, -Y1X1;\n" . + "--:-:-:-:1 FADD I07, -Y1X1, Y1X3;\n", + + j2c19 => "--:-:-:-:1 STS [writeS + 4x<32*04>], I04;\n", + j2c21 => "--:-:-:-:1 STS [writeS + 4x<32*05>], I05;\n", + j2c23 => "--:-:-:-:1 STS [writeS + 4x<32*06>], I06;\n", + j2c25 => "--:-:-:-:1 STS [writeS + 4x<32*07>], I07;\n", + + j2c27 => "--:-:-:-:1 \@P6 IADD.X track11, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track10, param_Np;\n", + + j2c31 => "--:-:-:-:1 FADD I08, Y2X0, -Y2X2;\n" . + "--:-:-:-:1 FADD I11, -Y2X1, Y2X3;\n" . + "--:-:-:-:1 FADD I09, Y2X1, Y2X2;\n" . 
+ "--:-:-:-:1 FADD I10, Y2X2, -Y2X1;\n", + + j2c32 => "--:-:-:-:1 STS [writeS + 4x<32*08>], I08;\n", + j2c34 => "--:-:-:-:1 STS [writeS + 4x<32*11>], I11;\n", + j2c36 => "--:-:-:-:1 STS [writeS + 4x<32*09>], I09;\n", + j2c38 => "--:-:-:-:1 STS [writeS + 4x<32*10>], I10;\n", + + j2c40 => "--:-:-:-:1 \@P6 IADD.X track21, track11, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track20, param_Np;\n", + + j2c44 => "--:-:-:-:1 LOP.AND.NZ P4, RZ, pred_bits, 0x4000;\n" . + "--:-:-:-:1 LOP.XOR pred_bits, pred_bits, 0x4000;\n", + + j2c46 => "--:-:-:-:1 \@P6 IADD.X track31, track21, RZ;\n" . + "--:-:-:-:1 IADD n, n, param_loopN;\n" . + "--:-:-:-:1 IADD offset, offset, param_loopN;\n", + + j2c62 => "--:-:-:-:1 \@P4 MOV swapBuf, 4x<(512*4 + 32)*2>;\n" . + "--:-:-:-:1 \@!P4 MOV swapBuf, -4x<(512*4 + 32)*2>;\n", + + j2c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@!P0 I2I.U32.U32 y0x0, RZ;\n" . + "--:-:-:-:0 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 LDG.E.CI$dtype y0x0, [track0];\n" . + "--:-:-:-:0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@!P1 I2I.U32.U32 y0x1, RZ;\n" . + "--:-:-:-:1 \@P1 LDG.E.CI$dtype y0x1, [track1];\n", + + j3c0 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y0x2, RZ;\n", + j3c1 => "--:-:-:-:1 \@P2 LDG.E.CI$dtype y0x2, [track2];\n", + j3c2 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y0x3, RZ;\n", + j3c3 => "--:6:2:-:1 \@P3 LDG.E.CI$dtype y0x3, [track3];\n" . + "--:-:-:Y:8 R2P PR, pred_bits, 0x0f;\n" . + "20:-:-:-:1 \@P6 IADD track00.CC, track00, param_2XNp;\n" . + "--:-:-:-:1 SHF.L.U64 pred_bits, pred_bits, 4, pred_bits;\n", + + j3c7 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y2x0, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track01, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track10, param_2XNp;\n", + + j3c9 => "--:-:-:-:1 PSETP.OR.AND P4, PT, P5, P6, PT;\n", + + j3c11 => "--:-:-:-:1 \@P0 LDG.E.CI$dtype y2x0, [track0];\n" . + "--:-:-:-:0 \@P6 IADD.X track11, track11, RZ;\n" . 
+ "--:-:-:-:1 \@!P1 I2I.U32.U32 y2x1, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track20, param_2XNp;\n", + + j3c12 => "--:-:-:-:1 \@P1 LDG.E.CI$dtype y2x1, [track1];\n", + + j3c16 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y2x2, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track21, track21, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track30, param_2XNp;\n", + + j3c17 => "--:-:-:-:1 \@P2 LDG.E.CI$dtype y2x2, [track2];\n", + + + j3c21 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y2x3, RZ;\n" . + "--:-:-:-:2 \@P6 IADD.X track31, track31, RZ;\n", + + j3c22 => "--:6:3:-:1 \@P3 LDG.E.CI$dtype y2x3, [track3];\n" . + "--:-:-:Y:8 R2P PR, pred_bits, 0x0f;\n" . + "20:-:-:-:1 \@P6 IADD track00.CC, track00, -param_XNp;\n" . + "--:-:-:-:1 SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;\n", + + j3c23 => "--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, P4;\n", + + + j3c25 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y1x0, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track01, track01, -RZ;\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track10, -param_XNp;\n", + + j3c26 => "--:-:-:-:1 \@P0 LDG.E.CI$dtype y1x0, [track0];\n", + + j3c30 => "--:-:-:-:1 \@!P1 I2I.U32.U32 y1x1, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track11, track11, -RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track20, -param_XNp;\n", + + j3c31 => "--:-:-:-:1 \@P1 LDG.E.CI$dtype y1x1, [track1];\n", + + j3c33 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n", + + j3c35 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y1x2, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track21, track21, -RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track30, -param_XNp;\n", + + j3c36 => "--:-:-:-:1 \@P2 LDG.E.CI$dtype y1x2, [track2];\n", + + j3c40 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y1x3, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track31, track31, -RZ;\n", + + j3c42 => "--:6:4:-:1 \@P3 LDG.E.CI$dtype y1x3, [track3];\n" . + "--:-:-:Y:8 R2P PR, pred_bits, 0x0f;\n" . + "20:-:-:-:1 \@P6 IADD track00.CC, track00, param_2XNp;\n" . + "--:-:-:-:1 SHF.L.U64 pred_bits, pred_bits, 12, pred_bits;\n", + + j3c46 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y3x0, RZ;\n" . 
+ "--:-:-:-:1 \@P6 IADD.X track01, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track10, param_2XNp;\n", + + j3c47 => "--:-:-:-:1 \@P0 LDG.E.CI$dtype y3x0, [track0];\n", + + j3c51 => "--:-:-:-:1 \@!P1 I2I.U32.U32 y3x1, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track11, track11, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track20, param_2XNp;\n", + + j3c52 => "--:-:-:-:1 \@P1 LDG.E.CI$dtype y3x1, [track1];\n", + + j3c56 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y3x2, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track21, track21, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track30, param_2XNp;\n", + + j3c57 => "--:-:-:-:1 \@P2 LDG.E.CI$dtype y3x2, [track2];\n", + + j3c60 => "--:-:-:-:2 \@!P3 I2I.U32.U32 y3x3, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track31, track31, RZ;\n", + + j3c62 => "--:6:5:-:1 \@P3 LDG.E.CI$dtype y3x3, [track3];\n", + + j3c63 => "--:-:-:Y:5 \@P4 BRA.U IMAGE_LOOP;\n", + ) + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 3) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 4; + my $bankOffset = $IX ? 0 : 8; + + my ($c0, $c2, $c4, $c6) = $j == 3 && !$IX ? (4,6,8,10) : (0,2,4,6); + + $insert{"j${j}c$c0"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy0, [readIs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, $bankOffset; + $insert{"j${j}c$c2"} = sprintf "--:-:-:-:1 LDS.U.128 j%dEx0, [readEs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, 8; + $insert{"j${j}c$c4"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy4, [readIs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, $bankOffset; + $insert{"j${j}c$c6"} = sprintf "--:-:1:-:1 LDS.U.128 j%dEx4, [readEs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, 8; + + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1; + + my $yield = $j < 3 && $stall && ($c % 3 == 0) ? 'Y' : '-'; + + my $wait = $c == 0 ? $j == 2 && !$IX ? '03' : '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] +[+ + our $IX; + return $IX ? q{ +// Advance x offset/preds + +--:-:-:-:1 IADD gxs, gxs, param_strideX; +--:-:-:-:1 IADD offset, offset, param_loopXI; + +01:-:-:-:1 BFE.U32 super_x, tid, param_superXI; +--:-:-:-:1 SHL gx, gxs, param_shiftXI; + +--:-:-:-:1 BFE.U32 n, tid, param_superNI; + +--:-:-:Y:d ISETP.LT.AND P5, PT, gxs, param_GXS, P6; +--:-:-:-:0 IADD gx, gx, super_x; +--:-:-:Y:5 @P5 BRA.U IMAGE_LOOP; + +// Advance y offset/preds +--:-:-:-:1 IADD gys, gys, param_strideY; +--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6; +--:-:-:-:1 LDS gxs, [addr_blk_Q]; +--:-:-:-:1 BFE.U32 super_x, tid, param_superXI; +--:-:-:-:1 PSETP.AND.AND P5, PT, PT, PT, PT; +--:-:-:-:0 BFE.U32 super_y, tid, param_superYI; +--:-:1:-:2 LDS blkC, [addr_blk_C]; +--:-:-:-:1 ISETP.LT.AND P6, PT, gys, param_GYS, PT; + +01:-:-:-:1 SHL gx, gxs, param_shiftXI; +--:-:-:-:1 SHL gy, gys, param_shiftYI; +--:-:-:-:1 IADD gx, gx, super_x; +--:-:-:-:1 IADD gy, gy, super_y; +--:-:-:-:1 XMAD.U16.U16 offset, gx, param_N, n; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, gy, param_XN, offset; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, blkC, param_YXN, offset; + +--:-:-:Y:5 @P6 BRA.U IMAGE_LOOP; + +// Set n to loop remaining times +--:-:-:-:1 LOP.AND.NZ P5, init, pred_bits, 3; +--:-:-:-:1 MOV nloop, param_loopN; +--:-:-:-:1 MOV N, param_N; +--:-:-:Y:a LOP.AND pred_bits, pred_bits, ~3; +--:-:-:-:0 VMAD.U16.U16 n, -init, nloop, N; +--:-:-:Y:5 @P5 BRA.U IMAGE_LOOP; +--:-:-:Y:5 BRA.U END_LOOP; + } : q{ +// Advance x offset/preds + +--:-:-:-:1 IADD gxs, gxs, 
param_strideX; +--:-:-:-:1 IADD offset, offset, param_loopX; +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, P6; +--:-:-:-:1 SHL x, gxs, param_shiftX; +01:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 n, tid, param_superN; +--:-:-:-:1 ISCADD x, super_x, x, 1; +--:-:-:-:1 IADD x, x, -param_pad_x; +--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 IADD x2, x, 2; +--:-:-:-:1 IADD x3, x, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_X, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_X, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_X, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_X, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x0f; +// Extract y + init + buffer bits +--:-:-:-:1 BFE.U32 mask_y, pred_bits, 0x710; +--:-:-:-:1 R2P PR, mask_y, 0x0f; +--:-:-:-:1 SEL pred_bits, mask_x, RZ, P0; +--:-:-:-:1 @P1 BFI pred_bits, mask_x, 0x404, pred_bits; +--:-:-:-:1 @P2 BFI pred_bits, mask_x, 0x408, pred_bits; +--:-:-:-:1 @P3 BFI pred_bits, mask_x, 0x40c, pred_bits; +--:-:-:-:0 BFI pred_bits, mask_y, 0x710, pred_bits; + + +--:-:-:Y:5 @P5 BRA.U IMAGE_LOOP; + +// Advance y offset/preds +--:-:-:-:1 IADD gys, gys, param_strideY; +--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6; +--:-:-:-:1 LDS gxs, [addr_blk_Q]; +--:-:-:-:0 BFE.U32 init, pred_bits, 0x314; +--:-:1:-:1 LDS blkC, [addr_blk_C]; +--:-:-:-:3 PSETP.AND.AND P5, PT, PT, PT, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, gys, param_GYS, PT; +--:-:-:-:5 CAL IMAGE_OFFSET; +--:-:-:-:0 BFI pred_bits, init, 0x314, pred_bits; +--:-:-:Y:5 @P6 BRA.U IMAGE_LOOP; + + +// Set n to loop remaining times +--:-:-:-:1 SHR.U32 pred_bits, init, 2; +--:-:-:-:1 MOV nloop, param_loopN; +--:-:-:-:1 MOV N, param_N; +--:-:-:Y:c LOP.AND.NZ P5, init, init, 3; +--:-:-:-:1 SHL pred_bits, pred_bits, 22; +--:-:-:-:0 VMAD.U16.U16 n, -init, nloop, N; +--:-:-:Y:5 @P5 BRA.U IMAGE_LOOP; +--:-:-:Y:5 
BRA.U END_LOOP; + + }; ++] + + +ERROR_LOOP: + +[+ + our ($dtype, $convert_in, $dtype_shift, $IX); + my %insert = ( + + $convert_in ? ( + j1c13 => "02:-:2:-:1 $convert_in p0q0, p0q0;\n", + j1c17 => "04:-:3:-:1 $convert_in p0q1, p0q1;\n", + j1c21 => "08:-:4:-:1 $convert_in p1q1, p1q1;\n", + j1c25 => "10:-:5:-:1 $convert_in p1q0, p1q0;\n", + ) : (), + + j1c23 => "02:-:-:-:1 FMUL e0, p0q0, 0.5;\n", + + j1c28 => "04:-:-:-:1 FFMA E01, p0q1, 0.5, e0;\n" . + "--:-:-:-:1 FFMA E02, p0q1, -0.5, e0;\n", + + j1c29 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*00 + 32>], E00;\n", + j1c31 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*01 + 32>], E01;\n", + j1c33 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*02 + 32>], E02;\n", + j1c35 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*03 + 32>], E03;\n", + + j1c37 => "08:-:-:-:1 FMUL e1, p1q1, 0.5;\n", + + j1c42 => "10:-:-:-:1 FFMA E13, p1q0, 0.5, e1;\n" . + "--:-:-:-:1 FFMA E14, p1q0, 0.5, -e1;\n", + + j1c43 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*12 + 32>], E12;\n", + j1c45 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*15 + 32>], E15;\n", + j1c47 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*13 + 32>], E13;\n", + j1c49 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*14 + 32>], E14;\n", + + j1c51 => "--:-:-:-:1 FFMA B0, p1q0, 0.5, e0;\n" . + "--:-:-:-:1 FFMA C0, p1q0, -0.5, e0;\n" . + "--:-:-:-:1 FFMA B1, p0q1, 0.5, e1;\n" . + "--:-:-:-:1 FFMA C1, p0q1, 0.5, -e1;\n", + + j2c9 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*04 + 32>], E04;\n", + j2c11 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*08 + 32>], E08;\n", + j2c13 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*07 + 32>], E07;\n", + j2c15 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*11 + 32>], E11;\n", + + j2c17 => "--:-:-:-:1 FMUL e2, B0, 0.5;\n" . + "--:-:-:-:1 FMUL e3, C0, 0.5;\n", + + j2c21 => "--:-:-:-:1 FFMA E05, B1, 0.5, e2;\n" . + "--:-:-:-:1 FFMA E06, B1, -0.5, e2;\n" . + "--:-:-:-:1 FFMA E09, C1, 0.5, e3;\n" . 
+ "--:-:-:-:1 FFMA E10, C1, -0.5, e3;\n", + + j2c23 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*05 + 32>], E05;\n", + j2c25 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*06 + 32>], E06;\n", + j2c27 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*09 + 32>], E09;\n", + j2c29 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*10 + 32>], E10;\n", + + j2c32 => "--:-:-:-:1 R2P PR, pred_bits, 0x0f;\n" . + "--:-:-:-:1 \@P6 LEA track00.CC, offset, param_E[0], $dtype_shift;\n", + + j2c37 => "--:-:-:-:1 \@P6 IADD.X track01, RZ, param_E[1];\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track00, param_Np;\n", + + j2c42 => "--:-:-:-:1 \@P6 IADD.X track11, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track00, param_QNp;\n", + + j2c44 => "--:-:-:-:1 LOP.AND.NZ P4, RZ, pred_bits, 0x400;\n" . + "--:-:-:-:1 LOP.XOR pred_bits, pred_bits, 0x400;\n", + + j2c47 => "--:-:-:-:1 \@P6 IADD.X track21, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track10, param_QNp;\n", + + j2c52 => "--:-:-:-:1 \@P6 IADD.X track31, track11, RZ;\n", + + j2c61 => "--:-:-:-:1 \@P4 MOV swapBuf, 4x<(512*4 + 32)*2>;\n" . + "--:-:-:-:1 \@!P4 MOV swapBuf, -4x<(512*4 + 32)*2>;\n", + + j2c62 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeS, writeS, swapBuf;\n", + + j3c8 => "--:-:2:-:1 \@P0 LDG.E.CI$dtype p0q0, [track0];\n", + j3c10 => "--:-:3:-:1 \@P1 LDG.E.CI$dtype p0q1, [track1];\n", + j3c12 => "--:-:4:-:1 \@P3 LDG.E.CI$dtype p1q1, [track3];\n", + j3c14 => "--:6:5:-:1 \@P2 LDG.E.CI$dtype p1q0, [track2];\n", + + j3c15 => "--:-:-:-:1 PSETP.OR.AND P4, PT, P5, P6, PT;\n" . + "--:-:-:-:1 IADD n, n, param_loopN;\n" . 
+ "--:-:-:-:1 IADD offset, offset, param_loopN;\n", + + j3c16 => "--:-:-:-:1 \@!P0 I2I.U32.U32 p0q0, RZ;\n", + j3c20 => "--:-:-:-:1 \@!P1 I2I.U32.U32 p0q1, RZ;\n", + j3c24 => "--:-:-:-:1 \@!P2 I2I.U32.U32 p1q0, RZ;\n", + j3c28 => "--:-:-:-:1 \@!P3 I2I.U32.U32 p1q1, RZ;\n", + + j3c25 => "--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, P4;\n", + + + j3c38 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n", + + + j3c63 => "--:-:-:Y:5 \@P4 BRA.U ERROR_LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 3) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 4; + my $bankOffset = $IX ? 0 : 8; + + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy0, [readIs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, $bankOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 LDS.U.128 j%dEx0, [readEs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, 8; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy4, [readIs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, $bankOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 LDS.U.128 j%dEx4, [readEs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, 8; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1; + + my $yield = $j < 3 && $stall && ($c % 3 == 0) ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +// Advance x offset/preds + +--:-:-:-:1 IADD gxs, gxs, param_strideX; +--:-:-:-:1 IADD offset, offset, param_loopX; +// Extract y + init + buffer bits +--:-:-:-:1 BFE.U32 mask_y, pred_bits, 0x704; +--:-:-:-:1 R2P PR, mask_y, 0x0c; +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, P6; +--:-:-:-:1 SHL x, gxs, param_shiftX; +01:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 ISCADD x, super_x, x, 1; +--:-:-:-:1 BFE.U32 n, tid, param_superN; +--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_Q, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_Q, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x03; +--:-:-:-:1 SEL pred_bits, mask_x, RZ, P2; +--:-:-:-:1 @P3 BFI pred_bits, mask_x, 0x202, pred_bits; +--:-:-:-:0 BFI pred_bits, mask_y, 0x704, pred_bits; + + +--:-:-:Y:5 @P5 BRA.U ERROR_LOOP; + +// Advance y offset/preds +--:-:-:-:1 IADD gys, gys, param_strideY; +--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6; +--:-:-:-:1 LDS gxs, [addr_blk_Q]; +--:-:-:-:0 BFE.U32 init, pred_bits, 0x308; +--:-:1:-:1 LDS blkK, [addr_blk_K]; +--:-:-:-:2 PSETP.AND.AND P5, PT, PT, PT, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, gys, param_GYS, PT; +--:-:-:-:5 CAL ERROR_OFFSET; +--:-:-:-:0 BFI pred_bits, init, 0x308, pred_bits; +--:-:-:Y:5 @P6 BRA.U ERROR_LOOP; + +// Set n to loop remaining times +--:-:-:-:1 SHR.U32 pred_bits, init, 2; +--:-:-:-:1 MOV nloop, param_loopN; +--:-:-:-:1 MOV N, param_N; +--:-:-:Y:c LOP.AND.NZ P5, init, init, 3; +--:-:-:-:1 SHL pred_bits, pred_bits, 10; +--:-:-:-:0 VMAD.U16.U16 n, -init, nloop, N; +--:-:-:Y:5 @P5 BRA.U ERROR_LOOP; + +END_LOOP: + +// K_blk, C_blk, P_blk, Q_blk +--:-:1:-:1 LDS.U.128 blkKCPQ, [addr_blk_K]; + + +--:-:-:-:1 MOV alpha, param_alpha; + +// Strip double 
buffering offsets, and the batch dimension on readIs +// This gives us the shared memory write mapping for the thread's registers: +// readEs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readIs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid_16, tid, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; + +--:-:-:-:1 LOP.AND tid_1, tid, 1; +--:-:-:-:1 LOP.AND readIs, tid, 8; +--:-:-:-:1 SHR.U32 readIs, readIs, 2; +--:-:-:-:1 LOP.OR readIs, readIs, tid_1; +--:-:-:-:1 SHL readIs, readIs, 4; + +--:-:-:-:1 BFE.U32 readEs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readEs, readEs, tid_16; +--:-:-:-:1 SHL readEs, readEs, 4; + +// writeCs = readIs * 512 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 9; + +// readCs = tid//32 * 512 + tid & 31 +--:-:-:-:1 LOP.AND tid_31, tid, 31; +--:-:-:-:1 SHR.U32 tid_32, tid, 5; +--:-:-:-:1 ISCADD readCs, tid_32, tid_31, 9; +--:-:-:-:1 SHL readCs, readCs, 2; + +// kk = K_blk*32 + tid&31 +01:-:-:-:1 ISCADD kk, K_blk, tid_31, 5; + +// cc = C_blk*32 + tid//32 +--:-:-:-:1 ISCADD cc, C_blk, tid_32, 5; + +// F00 = c*RSK + r*SK + s*K + k +--:-:-:-:1 XMAD.LO2C trackF, cc, param_RSK, kk; + +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSK, param_CRSK; +01:-:-:-:1 XMAD PQ_blk, P_blk, param_strideX, Q_blk; +--:-:-:-:1 XMAD.LO trackF, PQ_blk, CRSK, trackF, xmad_determ; + }; + } + return ''; ++] + +--:-:-:-:1 LEA F00_0.CC, trackF, param_F[0], 2; +--:-:-:-:1 LEA.HI.X F00_1, trackF, param_F[1], RZ, 2; + +--:-:-:-:1 MOV K1, param_K; +--:-:-:-:1 SHL K1, K1, 2; + +--:-:-:-:1 MOV SK1, param_SK; +--:-:-:-:1 SHL SK1, SK1, 2; + +--:-:-:-:1 MOV RSK8, param_RSK; +--:-:-:-:1 SHL RSK8, RSK8, 5; + +--:-:-:-:1 ISETP.LT.AND P0, PT, kk, param_K, PT; + + +--:-:-:-:6 IADD F01_0.CC, F00_0, K1; +--:-:-:-:1 IADD.X F01_1, F00_1, RZ; +--:-:-:-:6 IADD F02_0.CC, F01_0, K1; +--:-:-:-:1 IADD.X F02_1, F01_1, RZ; + +--:-:-:-:6 IADD F10_0.CC, F00_0, SK1; +--:-:-:-:1 IADD.X F10_1, F00_1, RZ; +--:-:-:-:6 IADD F11_0.CC, F01_0, SK1; +--:-:-:-:1 
IADD.X F11_1, F01_1, RZ; +--:-:-:-:6 IADD F12_0.CC, F02_0, SK1; +--:-:-:-:1 IADD.X F12_1, F02_1, RZ; + +--:-:-:-:6 IADD F20_0.CC, F10_0, SK1; +--:-:-:-:1 IADD.X F20_1, F10_1, RZ; +--:-:-:-:6 IADD F21_0.CC, F11_0, SK1; +--:-:-:-:1 IADD.X F21_1, F11_1, RZ; +--:-:-:-:6 IADD F22_0.CC, F12_0, SK1; +--:-:-:-:1 IADD.X F22_1, F12_1, RZ; + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*512 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*512 + 16>], shuffle_x4y2; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; 
+--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*512 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*512 + 16>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:0 LOP.XOR readCs, readCs, 4x<8*512>; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, cx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*512 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 
STS.128 [writeCs+4x<2*512 + 16>], shuffle_x4y6; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*512 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*512 + 16>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:0 LOP.XOR readCs, readCs, 4x<8*512>; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:0 LOP.XOR readCs, readCs, 4x<8*512>; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + +--:-:-:-:0 ISETP.LT.AND P1, PT, cc, param_C, P0; // cc < C && kk < K +--:-:-:-:1 LDS m00, [readCs + 4x< 0*32>]; +--:-:-:-:1 LDS m10, [readCs + 4x< 4*32>]; +--:-:-:-:1 LDS m01, [readCs + 4x< 1*32>]; +--:-:1:-:1 LDS m11, [readCs + 4x< 5*32>]; + +--:-:-:-:0 IADD cc, cc, 8; +--:-:-:-:1 LDS m21, [readCs + 4x< 9*32>]; +--:-:-:-:1 LDS m02, [readCs + 4x< 2*32>]; +--:-:-:-:1 LDS m12, [readCs + 4x< 6*32>]; +--:-:2:-:1 LDS m22, [readCs + 4x<10*32>]; + +--:-:-:-:1 LDS m31, [readCs + 4x<13*32>]; +--:-:-:-:1 LDS m20, [readCs + 4x< 8*32>]; +--:-:-:-:1 LDS m32, [readCs + 4x<14*32>]; +--:-:3:-:1 LDS m03, [readCs + 4x< 3*32>]; + +--:-:-:-:1 LDS m13, [readCs + 4x< 7*32>]; +--:-:-:-:1 LDS m23, [readCs + 4x<11*32>]; +--:-:-:-:1 LDS m30, [readCs + 4x<12*32>]; +--:-:4:-:1 LDS m33, [readCs + 4x<15*32>]; + +01:-:-:-:1 FADD t00, m00, m10; +--:-:-:-:1 FADD t01, m01, m11; +02:-:-:-:1 FADD t21, m11, m21; +--:-:-:-:1 FADD t02, m02, m12; +--:-:-:-:1 FADD t11, m11, -m21; +--:-:-:-:1 FADD t22, m12, m22; +--:-:-:-:1 FADD t12, m12, -m22; +--:-:-:-:1 FADD t01, t01, m21; +04:-:-:-:1 FADD t21, t21, m31; +--:-:-:-:1 FADD t02, t02, m22; +--:-:-:-:1 FADD t20, m10, m20; +--:-:-:-:1 FADD t22, t22, m32; +--:-:-:-:1 FADD t00, t00, m20; +08:-:-:-:1 FADD t03, m03, m13; +--:-:-:-:1 FADD t10, m10, -m20; +--:-:-:-:1 FADD t23, m13, m23; +--:-:-:-:1 FADD t20, t20, m30; +--:-:-:-:1 FADD t13, m13, -m23; +--:-:-:-:1 FADD f00, t00, t01; +--:-:-:-:1 
FADD t03, t03, m23; +--:-:-:-:1 FADD f02, t01, t02; +--:-:-:-:1 FADD t23, t23, m33; +--:-:-:-:1 FADD f10, t10, t11; +--:-:-:-:1 FADD f12, t11, t12; +--:-:-:-:1 FADD f20, t20, t21; +--:-:-:-:1 FADD f22, t21, t22; +--:-:-:-:1 FADD f00, f00, t02; +--:-:-:-:1 FADD f01, t01, -t02; +--:-:-:-:0 FADD f02, f02, t03; +--:-:-:-:1 @P1 [+ output_op() +] [F00_0], f00; +--:-:-:-:0 FADD f10, f10, t12; +--:-:-:-:1 @P1 [+ output_op() +] [F01_0], f01; +--:-:-:-:0 FADD f11, t11, -t12; +--:1:-:-:1 @P1 [+ output_op() +] [F02_0], f02; +--:-:-:-:0 FADD f12, f12, t13; +--:-:-:-:1 @P1 [+ output_op() +] [F10_0], f10; +--:-:-:-:0 FADD f20, f20, t22; +--:-:-:-:1 @P1 [+ output_op() +] [F11_0], f11; +--:-:-:-:0 FADD f21, t21, -t22; +--:2:-:-:1 @P1 [+ output_op() +] [F12_0], f12; +--:-:-:-:0 FADD f22, f22, t23; +--:-:-:-:1 @P1 [+ output_op() +] [F20_0], f20; +--:-:-:-:1 @P1 [+ output_op() +] [F21_0], f21; +--:3:-:-:1 @P1 [+ output_op() +] [F22_0], f22; + +01:-:-:-:6 IADD F00_0.CC, F00_0, RSK8; +--:-:-:-:1 IADD.X F00_1, F00_1, RZ; +--:-:-:-:6 IADD F01_0.CC, F01_0, RSK8; +--:-:-:-:1 IADD.X F01_1, F01_1, RZ; +--:-:-:-:6 IADD F02_0.CC, F02_0, RSK8; +--:-:-:-:1 IADD.X F02_1, F02_1, RZ; +02:-:-:-:6 IADD F10_0.CC, F10_0, RSK8; +--:-:-:-:1 IADD.X F10_1, F10_1, RZ; +--:-:-:-:6 IADD F11_0.CC, F11_0, RSK8; +--:-:-:-:1 IADD.X F11_1, F11_1, RZ; +--:-:-:-:6 IADD F12_0.CC, F12_0, RSK8; +--:-:-:-:1 IADD.X F12_1, F12_1, RZ; +04:-:-:-:6 IADD F20_0.CC, F20_0, RSK8; +--:-:-:-:1 IADD.X F20_1, F20_1, RZ; +--:-:-:-:6 IADD F21_0.CC, F21_0, RSK8; +--:-:-:-:1 IADD.X F21_1, F21_1, RZ; +--:-:-:-:6 IADD F22_0.CC, F22_0, RSK8; +--:-:-:-:0 IADD.X F22_1, F22_1, RZ; + +--:-:-:-:5 RET; + diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_3x3_4x4_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_3x3_4x4_32x32.sass new file mode 100644 index 0000000..20e8a9d --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_winograd_3x3_4x4_32x32.sass @@ -0,0 +1,1047 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +# Template configuration. $type and $D are declared here but never assigned in this file; +# presumably the assembler driver sets them before this block runs (TODO confirm). +our ($type, $D); +# A true $determ makes output_op() return the STG.E.CG store instead of RED.E.ADD.F32.FTZ.RN. +our $determ = $D; +# $type eq 'h' selects the F16 settings below; any other value selects the 32-bit ones. +# $convert_in is only tested for truth by the load code to choose the F2F conversion path. +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +# NOTE(review): $convert_out is not referenced again in this kernel. +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +# Shift operand of the LEA / LEA.HI.X instructions that build the track address. +our $dtype_shift = $type eq 'h' ? '1' : '2'; +# Factor used in the LDG address offsets 4x<n*32 * $dtype_size>. +our $dtype_size = $type eq 'h' ? '2' : '4'; +# Width suffix of the LDG.E / LDS.U loads that fill T0..T8. +our $vec_size = $type eq 'h' ? '64' : '128'; +# Accessors used from inline [+ ... +] expressions in the instruction text. +sub dtype_shift { return $dtype_shift; } +sub vec_size { return $vec_size; } +sub output_op { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; } +-] + + + + addr_zero : 4x<32*36*2*4 + 64 + 0> + addr_rYXN : 4x<32*36*2*4 + 64 + 4> + addr_iYXN : 4x<32*36*2*4 + 64 + 5> + addr_idx_K : 4x<32*36*2*4 + 64 + 6> + addr_idx_C : 4x<32*36*2*4 + 64 + 7> + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_K : c[0x0][0x15c] + param_C : c[0x0][0x160] + param_k : c[0x0][0x164] + param_c : c[0x0][0x168] + param_kc : c[0x0][0x16c] + param_magic_kc : c[0x0][0x170] + param_shift_kc : c[0x0][0x174] + param_magic_c : c[0x0][0x178] + param_shift_c : c[0x0][0x17c] + param_YXN2 : c[0x0][0x180] + param_sYXN : c[0x0][0x184] + param_magic_sYXN : c[0x0][0x188] + param_shift_sYXN : c[0x0][0x18c] + param_stride_YXNp : c[0x0][0x190] + param_YXN : c[0x0][0x194] + param_YXN_1152 : c[0x0][0x198] + param_RSK : c[0x0][0x19c] + param_CRSK : c[0x0][0x1a0] + param_Kp : c[0x0][0x1a4] + param_SKp : c[0x0][0x1a8] + 
param_RSK15_SK2p : c[0x0][0x1ac] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10 : clx<0-3>y0 + 7, 6,15,14 : clx<0-3>y1 + 1, 0, 9, 8 : clx<0-3>y2 + 5, 4,13,12 : clx<0-3>y3 + 19,18,27,26 : clx<0-3>y4 + 23,22,31,30 : clx<0-3>y5 + 17,16,25,24 : clx<0-3>y6 + 21,20,29,28 : clx<0-3>y7 + + 32-43 : jl0Fx<0-3>, jl0Iy<0-7> + 44-51 : jl1Fx<0-3>, jl1Iy<4-7> + 36-39 : jl1Iy<0-3> + + 52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3> + 88-89 : track<0-1> + 90-91 ~ writeS + + 32-86 ~ idx_YXNkc, idx_K, idx_C, idx_YXN, div<1-3>, magic_kc, neg_kc, idx_kc, idx_k, idx_c, YXN2_idx, neg_sYXN, magic_sYXN, remainder, yxn, offset, offset2, tid32_2, tid1, tid31 + 87 = tid + + 32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1 + 48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16 + + + 3, 2,11,10,19,18,27,26 : ccx<0-7>y0 + 7, 6,15,14,23,22,31,30 : ccx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2 + 5, 4,13,12,21,20,29,28 : ccx<0-7>y3 + 35,34,43,42,51,50,59,58 : ccx<0-7>y4 + 39,38,47,46,55,54,63,62 : ccx<0-7>y5 + 33,32,41,40,49,48,57,56 : ccx<0-7>y6 + 37,36,45,44,53,52,61,60 : ccx<0-7>y7 + + 64-79 : jc0Fx<0-7>, jc0Iy<0-7> + 80-91 : jc1Fx<4-7>, jc1Iy<0-7> + 64-67 : jc1Fx<0-3> + + 64-86 ~ tid16, tid_1, tid128 + + 92-95 ~ reduce_YXN, swapBuf, readFs, readIs + + + 64-89 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxC, idxK, idxI, readFs2, readIs2, offsetF, k, CRSK, xmad_determ + 86-89 : Out1<0-1>, Out2<0-1> + 90-91 : Out0<0-1> + 92-95 ~ alpha, writeCs, readCs, c + + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + + 84-85 ~ t<0-1> + + 3, 2,11,19,10,18 : m<0-5>0 + 1, 9, 0, 8,17,16 : m<0-5>1 + 27,26,25,24,64,65 : m<0-5>2 + 66,67,68,69,70,71 : m<0-5>3 + 72,73,74,75,76,77 : m<0-5>4 + 78,79,80,81,82,83 : m<0-5>5 + + 3, 2,11 : w00, w10, w20 + 1, 9, 0 : w01, w11, w21 + 27,26,25 : w02, w12, w22 + 66,67,68 : w03, w13, w23 + 72,73,74 : w04, w14, w24 + 78,79,80 : w05, w15, w25 + + 19,10,18,69,70,71 ~ s00, s10, s20 + 8,17,16,75,76,77 ~ s02, s12, s22 + 24,64,65,81,82,83 ~ s01, 
s11, s21 + + + +--:-:-:-:0 MOV swapBuf, 4x<32*36*2*2>; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:-:-:1 STS.128 [addr_zero], RZ; +01:-:-:Y:d ISETP.GE.AND P0, PT, tid, 128, PT; +--:-:-:-:5 @P0 BRA.U COMPUTE_SETUP; + +############################################################## +LOAD_SETUP: + +--:-:1:-:1 S2R idx_YXNkc, SR_CTAID.X; +--:-:2:-:1 S2R idx_K, SR_CTAID.Z; +--:-:3:-:1 S2R idx_C, SR_CTAID.Y; + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +] + +--:-:-:-:1 ISETP.EQ.AND P0, PT, tid, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, tid, 64, PT; + +// idx_YXN = idx_YXNkc / blk_kc +--:-:-:-:1 MOV magic_kc, param_magic_kc; +--:-:-:-:1 IADD neg_kc, RZ, -param_kc; +--:-:-:-:1 ISETP.NE.AND P2, PT, magic_kc, 1, PT; +01:-:-:-:1 @P2 XMAD div1, idx_YXNkc, magic_kc, RZ; +--:-:-:-:1 @P2 XMAD div2, idx_YXNkc, magic_kc.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, idx_YXNkc.H1, magic_kc.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, idx_YXNkc.H1, magic_kc, div1; +--:-:-:-:1 @P2 IADD3.RS idx_YXN, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 idx_YXN, idx_YXN, param_shift_kc; +--:-:-:-:1 @!P2 SHR.U32 idx_YXN, idx_YXNkc, param_shift_kc; + +// idx_kc = idx_YXNkc % blk_kc +--:-:-:-:1 XMAD.LO2 idx_kc, neg_kc, idx_YXN, idx_YXNkc; + +// idx_k = idx_kc / blk_c +// idx_c = idx_kc % blk_c +--:-:-:-:1 XMAD idx_k, idx_kc, param_magic_c, RZ; +--:-:-:-:1 SHR.U32 idx_k, idx_k, param_shift_c; +--:-:-:-:1 XMAD idx_c, idx_k, param_c, RZ; +--:-:-:-:1 IADD idx_c, -idx_c, idx_kc; + +// idx_K = idx_K * blk_k + idx_k +// idx_C = idx_C * blk_c + idx_c +02:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; +04:-:-:-:1 XMAD idx_C, idx_C, param_c, idx_c; + +// reduce_YXN = ceil((YXN2 - idx_YXN) / sYXN) +--:-:-:-:1 IADD YXN2_idx, -idx_YXN, param_YXN2; +--:-:-:-:1 IADD neg_sYXN, RZ, -param_sYXN; +--:-:-:-:1 MOV magic_sYXN, param_magic_sYXN; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_sYXN, 1, PT; +--:-:-:-:1 @P3 XMAD div1, YXN2_idx, magic_sYXN, RZ; +--:-:-:-:1 @P3 XMAD div2, YXN2_idx, magic_sYXN.H1, RZ; 
+--:-:-:-:1 @P3 XMAD div3, YXN2_idx.H1, magic_sYXN.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, YXN2_idx.H1, magic_sYXN, div1; +--:-:-:-:1 @P3 IADD3.RS reduce_YXN, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 reduce_YXN, reduce_YXN, param_shift_sYXN; +--:-:-:-:1 @!P3 SHR.U32 reduce_YXN, YXN2_idx, param_shift_sYXN; + +// remainder = YXN2_idx - reduce_YXN*sYXN; it is then combined with 1 and added back, +// which rounds reduce_YXN up when the division was inexact +// (assumes IMNMX.U32 with PT selects the minimum - TODO confirm) +--:-:-:-:1 XMAD.LO2 remainder, neg_sYXN, reduce_YXN, YXN2_idx; +--:-:-:-:1 IMNMX.U32 remainder, remainder, 1, PT; +--:-:-:-:1 IADD reduce_YXN, reduce_YXN, remainder; + +// only the thread with tid == 0 (P0) stores the block indices and the reduction count +--:-:-:-:1 @P0 STS [addr_iYXN], idx_YXN; +--:-:-:-:1 @P0 STS [addr_idx_K], idx_K; +--:-:-:-:1 @P0 STS [addr_idx_C], idx_C; +--:6:-:-:1 @P0 STS [addr_rYXN], reduce_YXN; + +// yxn = (tid & 63) >> 5 +--:-:-:-:1 BFE.U32 yxn, tid, 0x105; // 1 bit at position 5 + +// offset = reduce_YXN - 1 +// offset2 = (idx_YXN + offset*sYXN)*2 + yxn +--:-:-:-:1 IADD offset, reduce_YXN, -1; +--:-:-:-:1 XMAD offset2, offset, param_sYXN, idx_YXN; +--:-:-:-:1 XMAD.PSL offset2, offset.H1, param_sYXN, offset2; +--:-:-:-:1 ISCADD offset2, offset2, yxn, 1; + +// P6 = offset2 < YXN +--:-:-:-:1 ISETP.LT.AND P6, PT, offset2, param_YXN, PT; + +// P5 = reduce_YXN > 1 +--:-:-:-:1 ISETP.GT.AND P5, PT, reduce_YXN, 1, PT; + +--:-:-:-:1 LOP.AND tid32_2, tid, -32; +--:-:-:-:1 SHR.U32 tid32_2, tid32_2, 2; + +// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7), then scaled by 16 and offset by 4x<32*36*2> +--:-:-:-:1 BFE.U32 readFs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readFs, readFs, tid32_2; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*36*2>, 4; + +// readIs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1), then scaled by 16 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readIs, tid, 16; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 IADD3 readIs, readIs, tid1, tid32_2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// writeS = (yxn*32*36 + (tid & 31)*4)*4 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL writeS, tid31, 4; +--:-:-:-:1 XMAD writeS, yxn, 4x<32*36>, writeS; + +// offset = offset2*32*36 + (tid & 31)*4 +--:-:-:-:1 SHL tid31, tid31, 2; +--:-:-:-:0 XMAD.LO2 offset, offset2, 1x<32*36>, tid31; + 
+ +--:-:-:-:6 @P1 BRA.U FILTER_SETUP; + +############################################################## +IMAGE_SETUP: + + +// (GC32,GY,GX,N,6,6,32) +// offset += idx_C * YXN*32*36 +--:-:-:-:1 XMAD.LO2C offset, idx_C, param_YXN_1152, offset; + +--:-:-:-:1 LEA track0.CC, offset, param_I[0], [+ dtype_shift() +]; +--:-:-:-:0 LEA.HI.X track1, offset, param_I[1], RZ, [+ dtype_shift() +]; + + +--:-:-:-:6 BRA.U LOAD; + +############################################################## +FILTER_SETUP: + + +// writeS += 32*36*2*4 +--:-:-:-:1 IADD writeS, writeS, 4x<32*36*2>; + +// (GK32,GY,GX,N,6,6,32) +// offset += idx_K * YXN*32*36 +--:-:-:-:1 XMAD.LO2C offset, idx_K, param_YXN_1152, offset; + +--:-:-:-:1 LEA track0.CC, offset, param_E[0], [+ dtype_shift() +]; +--:-:-:-:2 LEA.HI.X track1, offset, param_E[1], RZ, [+ dtype_shift() +]; + + +############################################################## +LOAD: + +20:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>]; +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>]; +--:-:2:-:1 @P6 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T0, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T1, [addr_zero]; +--:-:2:-:1 @!P6 LDS.U.[+ vec_size() +] T2, [addr_zero]; + +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>]; +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>]; +--:-:3:-:1 @P6 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T3, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T4, [addr_zero]; +--:-:3:-:1 @!P6 LDS.U.[+ vec_size() +] T5, [addr_zero]; + +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>]; +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>]; +--:-:4:-:1 @P6 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T6, 
[addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T7, [addr_zero]; +--:-:4:-:1 @!P6 LDS.U.[+ vec_size() +] T8, [addr_zero]; + +[+ + our $convert_in; + return $convert_in ? q{ + +02:-:-:-:1 F2F.F32.F16 T03, T01.H1; +--:-:-:-:1 F2F.F32.F16 T02, T01.H0; +--:-:-:-:1 F2F.F32.F16 T01, T00.H1; +--:-:2:-:1 F2F.F32.F16 T00, T00.H0; + +--:-:-:-:1 F2F.F32.F16 T13, T11.H1; +--:-:-:-:1 F2F.F32.F16 T12, T11.H0; +--:-:-:-:1 F2F.F32.F16 T11, T10.H1; +--:-:5:-:1 F2F.F32.F16 T10, T10.H0; + +--:-:-:-:1 F2F.F32.F16 T23, T21.H1; +--:-:-:-:1 F2F.F32.F16 T22, T21.H0; +--:-:-:-:1 F2F.F32.F16 T21, T20.H1; +--:-:6:-:1 F2F.F32.F16 T20, T20.H0; + +02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0; + +04:-:-:-:1 F2F.F32.F16 T33, T31.H1; +--:-:-:-:1 F2F.F32.F16 T32, T31.H0; +--:-:-:-:1 F2F.F32.F16 T31, T30.H1; +--:-:3:-:1 F2F.F32.F16 T30, T30.H0; + +10:-:-:-:1 STS.128 [writeS + 4x<1*32*4>], T1; + +--:-:-:-:1 F2F.F32.F16 T43, T41.H1; +--:-:-:-:1 F2F.F32.F16 T42, T41.H0; +--:-:-:-:1 F2F.F32.F16 T41, T40.H1; +--:-:5:-:1 F2F.F32.F16 T40, T40.H0; + +20:-:-:-:1 STS.128 [writeS + 4x<2*32*4>], T2; + +--:-:-:-:1 F2F.F32.F16 T53, T51.H1; +--:-:-:-:1 F2F.F32.F16 T52, T51.H0; +--:-:-:-:1 F2F.F32.F16 T51, T50.H1; +--:-:6:-:1 F2F.F32.F16 T50, T50.H0; + +04:-:-:-:1 STS.128 [writeS + 4x<3*32*4>], T3; + +08:-:-:-:1 F2F.F32.F16 T63, T61.H1; +--:-:-:-:1 F2F.F32.F16 T62, T61.H0; +--:-:-:-:1 F2F.F32.F16 T61, T60.H1; +--:-:4:-:1 F2F.F32.F16 T60, T60.H0; + +10:-:-:-:1 STS.128 [writeS + 4x<4*32*4>], T4; + +--:-:-:-:1 F2F.F32.F16 T73, T71.H1; +--:-:-:-:1 F2F.F32.F16 T72, T71.H0; +--:-:-:-:1 F2F.F32.F16 T71, T70.H1; +--:-:5:-:1 F2F.F32.F16 T70, T70.H0; + +20:-:-:-:1 STS.128 [writeS + 4x<5*32*4>], T5; + +--:-:-:-:1 F2F.F32.F16 T83, T81.H1; +--:-:-:-:1 F2F.F32.F16 T82, T81.H0; +--:-:-:-:1 F2F.F32.F16 T81, T80.H1; +--:-:6:-:1 F2F.F32.F16 T80, T80.H0; + +08:-:-:-:1 STS.128 [writeS + 4x<6*32*4>], T6; +10:-:-:-:1 STS.128 [writeS + 4x<7*32*4>], T7; +20:-:-:-:1 STS.128 [writeS + 4x<8*32*4>], T8; + + } : q{ +02:-:-:-:1 STS.128 
[writeS + 4x<0*32*4>], T0; +--:-:-:-:1 STS.128 [writeS + 4x<1*32*4>], T1; +--:-:-:-:1 STS.128 [writeS + 4x<2*32*4>], T2; +04:-:-:-:1 STS.128 [writeS + 4x<3*32*4>], T3; +--:-:-:-:1 STS.128 [writeS + 4x<4*32*4>], T4; +--:-:-:-:1 STS.128 [writeS + 4x<5*32*4>], T5; +08:-:-:-:1 STS.128 [writeS + 4x<6*32*4>], T6; +--:-:-:-:1 STS.128 [writeS + 4x<7*32*4>], T7; +--:-:-:-:1 STS.128 [writeS + 4x<8*32*4>], T8; + }; ++] + +--:-:-:-:0 IADD track0.CC, track0, -param_stride_YXNp; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:1 LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>]; +--:-:1:-:1 LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>]; + +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>]; +--:-:2:-:1 @P5 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>]; +--:-:3:-:1 @P5 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>]; +--:6:4:-:1 @P5 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>]; + +--:-:-:-:5 BRA.U LOAD_LOOP; + +############################################################## + +COMPUTE_SETUP: + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +--:-:-:-:1 IADD tid128, tid, -128; + +// readFs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readIs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid128, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readFs, tid128, 0x201; // 2 bits at 
position 1 +--:-:-:-:1 LOP.OR readFs, readFs, tid16; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*4 + 32*36*2>, 4; + +--:-:-:-:1 LOP.AND tid_1, tid128, 1; +--:-:-:-:1 LOP.AND readIs, tid128, 8; +--:-:-:-:1 SHR.U32 readIs, readIs, 2; +--:-:-:-:1 IADD3 readIs, readIs, tid16, tid_1; +--:-:-:-:0 ISCADD readIs, readIs, 4x<32*4>, 4; + + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS reduce_YXN, [addr_rYXN]; + +--:-:-:-:1 LDS.U.128 jc0Iy0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Fx0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Iy4, [readIs + 4x<0*32*36 + 16>]; +--:-:1:-:2 LDS.U.128 jc0Fx4, [readFs + 4x<0*32*36 + 16>]; + +COMPUTE_LOOP: +[+ + # Generates the loop body: two passes (j = 0, 1) of 64 FFMAs each over the ccx<x>y<y> accumulators. + # %insert maps "j<pass>c<ffma index>" to text that is emitted right after that FFMA. + my %insert = ( + + j0c33 => "--:-:-:-:1 ISETP.GT.AND P0, PT, reduce_YXN, 1, PT;\n" . + "--:-:-:-:1 IADD reduce_YXN, reduce_YXN, -1;\n", + + j0c62 => "02:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readIs, readIs, swapBuf;\n" . + "--:-:-:-:1 IADD readFs, readFs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j1c63 => "--:-:-:Y:5 \@P0 BRA.U COMPUTE_LOOP;\n" . + "--:-:-:Y:5 BRA.U COMPUTE_FINISH;\n", + ); + # @cOrder lists the [x,y] accumulator updated by each FFMA: for every x in (0,2,4,6) and + # every y in @y the four @swirl offsets are applied; @y is reversed after each x. + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 1) + { + # Pass $j multiplies the jc<$odd> operands while the LDS lines inserted below load + # the jc<$nOdd> set; in pass 1 those loads carry the @P0 predicate. + my $odd = $j; + my $nOdd = 1 - $j; + my $rsPred = $j == 1 ? '@P0' : ' '; + my $bar = $j == 0 ? '2' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIy4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFx4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIy0, [readIs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd; + + $insert{"j${j}c31"} = sprintf "--:%s:1:-:1 %s LDS.U.128 jc%dFx0, [readFs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd; + + + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + # Stall count is 0 when the first line of the inserted text contains one of these opcodes, else 1. + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + # Yield flag only on every tenth FFMA, and only when the stall count is 1. + my $yield = $c % 10 == 0 && $stall ? 'Y' : '-'; + + # Only the first FFMA of a pass carries a wait mask (01). + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA ccx%dy%d, jc%dFx%d, jc%dIy%d, ccx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +LOAD_LOOP: +--:-:-:-:1 ISETP.GT.AND P0, PT, reduce_YXN, 1, PT; +20:-:-:-:1 IADD track0.CC, track0, -param_stride_YXNp; +--:-:-:-:1 ISETP.GT.AND P1, PT, reduce_YXN, 2, PT; +--:-:-:-:1 IADD reduce_YXN, reduce_YXN, -1; +[+ + our ($vec_size, $dtype_size, $convert_in); + my %insert = ( + + j0c3 => "--:-:-:-:1 IADD.X track1, track1, -RZ;\n", + + j0c0 => "--:-:-:-:1 LDS.U.128 jl1Iy4, [readIs + 4x<1*32*36 + 16>];\n", + j0c2 => "--:-:-:-:1 LDS.U.128 jl1Fx0, [readFs + 4x<1*32*36 + 00>];\n", + j0c18 => "--:-:1:-:1 LDS.U.128 jl1Iy0, [readIs + 4x<1*32*36 + 00>];\n", + + j1c12 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>];\n", + j1c14 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>];\n", + j1c16 => "--:-:1:-:1 \@P0 LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>];\n", + + $convert_in ? ( + + j0c1 => "02:-:-:-:1 F2F.F32.F16 T03, T01.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T02, T01.H0;\n", + j0c4 => "--:-:-:-:1 F2F.F32.F16 T01, T00.H1;\n" . + "--:-:2:-:1 F2F.F32.F16 T00, T00.H0;\n", + + j0c5 => "--:-:-:-:1 F2F.F32.F16 T13, T11.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T12, T11.H0;\n", + j0c6 => "--:-:-:-:1 F2F.F32.F16 T11, T10.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T10, T10.H0;\n", + + j0c7 => "--:-:-:-:1 F2F.F32.F16 T23, T21.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T22, T21.H0;\n", + j0c8 => "--:-:-:-:1 F2F.F32.F16 T21, T20.H1;\n" . 
+ "--:-:6:-:1 F2F.F32.F16 T20, T20.H0;\n", + + j0c9 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n", + j0c10 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n", + j0c11 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n", + + j0c13 => "02:-:-:-:1 \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n", + j0c14 => "10:-:-:-:1 \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n", + j0c15 => "20:-:2:-:1 \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n", + + j0c16 => "04:-:-:-:1 F2F.F32.F16 T33, T31.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T32, T31.H0;\n", + j0c17 => "--:-:-:-:1 F2F.F32.F16 T31, T30.H1;\n" . + "--:-:3:-:1 F2F.F32.F16 T30, T30.H0;\n", + + j0c19 => "--:-:-:-:1 F2F.F32.F16 T43, T41.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T42, T41.H0;\n", + j0c20 => "--:-:-:-:1 F2F.F32.F16 T41, T40.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T40, T40.H0;\n", + + j0c21 => "--:-:-:-:1 F2F.F32.F16 T53, T51.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T52, T51.H0;\n", + j0c22 => "--:-:-:-:1 F2F.F32.F16 T51, T50.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 T50, T50.H0;\n", + + j0c23 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n", + j0c24 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n", + j0c25 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n", + + j0c27 => "04:-:-:-:1 \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n", + j0c28 => "10:-:-:-:1 \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n", + j0c29 => "20:-:3:-:1 \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n", + + j0c30 => "08:-:-:-:1 F2F.F32.F16 T63, T61.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T62, T61.H0;\n", + j0c31 => "--:-:-:-:1 F2F.F32.F16 T61, T60.H1;\n" . + "--:-:4:-:1 F2F.F32.F16 T60, T60.H0;\n", + + j1c0 => "--:-:-:-:1 F2F.F32.F16 T73, T71.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T72, T71.H0;\n", + j1c1 => "--:-:-:-:1 F2F.F32.F16 T71, T70.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T70, T70.H0;\n", + + j1c2 => "--:-:-:-:1 F2F.F32.F16 T83, T81.H1;\n" . 
+ "--:-:-:-:1 F2F.F32.F16 T82, T81.H0;\n", + j1c3 => "--:-:-:-:1 F2F.F32.F16 T81, T80.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 T80, T80.H0;\n", + + j1c4 => "08:4:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n", + j1c5 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n", + j1c6 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n", + + j1c8 => "08:-:-:-:1 \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n", + j1c9 => "10:-:-:-:1 \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n", + j1c10 => "20:6:4:-:1 \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n", + + ) : ( + + # Branch taken when $convert_in is empty: T0..T8 are stored without any F2F conversion. + # NOTE(review): the j0c6 store of T0 is the only STS in this table without the @P0 + # predicate (the other eight here, and all nine in the converting branch, have it) - + # confirm whether that is intentional. + j0c6 => "02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0;\n", + j0c8 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n", + j0c10 => "--:2:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n", + + j0c12 => "02:-:-:-:1 \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n", + j0c14 => "--:-:-:-:1 \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n", + j0c16 => "--:-:2:-:1 \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n", + + j0c20 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n", + j0c22 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n", + j0c24 => "--:3:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n", + + j0c26 => "04:-:-:-:1 \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n", + j0c28 => "--:-:-:-:1 \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n", + j0c30 => "--:-:3:-:1 \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n", + + j1c0 => "08:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n", + j1c2 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n", + j1c4 => "--:4:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n", + + j1c6 => "08:-:-:-:1 \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n", + j1c8 => "--:-:-:-:1 \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n", + j1c10 => "--:6:4:-:1 \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n", + ), + + j1c11 
=> "--:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j1c31 => "--:-:-:Y:5 \@P0 BRA.U LOAD_LOOP;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "01" : '--'; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dFx%d, jl%dIy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + +--:-:1:-:2 S2R Tid, SR_TID.X; + +--:-:-:-:1 MOV alpha16, param_alpha; + +01:-:-:-:1 LOP.AND Tid32_2, Tid, -32; +--:-:-:-:1 SHR.U32 Tid32_2, Tid32_2, 2; + +// readIs = ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND Tid1, Tid, 1; +01:-:-:-:1 LOP.AND readIs, Tid, 16; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 IADD readIs, readIs, Tid1; + +// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readIs << 2) +--:-:-:-:1 BFE.U32 readFs, Tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readFs, readFs, Tid32_2; +--:-:-:-:1 ISCADD readFs, readIs, readFs, 2; + +--:-:-:-:1 SHL readFs, readFs, 4; +--:-:-:-:1 SHL readIs, readIs, 3; + +// writeCs = readIs * 32*36 + readFs; +--:-:-:-:1 XMAD write16Cs, readIs, 1x<32*36>, readFs; + + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y2, 
alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y2, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y2, alpha16; +--:-:-:-:4 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y3, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y3, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y6, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y6, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y7, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y7, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 
BAR.SYNC 0; + +--:-:-:-:5 EXIT; + +COMPUTE_FINISH: + +--:-:1:-:2 S2R tid_128, SR_TID.X; + + +--:-:-:-:1 MOV alpha, param_alpha; + +// tid_128 = tid - 128 +01:-:-:-:1 IADD tid_128, tid_128, -128; + +// P4 = tid_128 >= 256; these threads branch around the address setup and every CAL OUTPUT_TRANSFORM +--:-:-:-:1 ISETP.GE.AND P4, PT, tid_128, 256, PT; + +// readIs2 = ((tid_128 & 8) >> 2) | (tid_128 & 1) +--:-:-:-:1 LOP.AND Tid_1, tid_128, 1; +--:-:-:-:1 LOP.AND readIs2, tid_128, 8; +--:-:-:-:1 SHR.U32 readIs2, readIs2, 2; +--:-:-:-:1 IADD readIs2, readIs2, Tid_1; + +// readFs2 = ((tid_128 & -16) >> 1) | ((tid_128 >> 1) & 3) | (readIs2 << 2) +--:-:-:-:1 LOP.AND tid_16, tid_128, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; +--:-:-:-:1 BFE.U32 readFs2, tid_128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readFs2, readFs2, tid_16; +--:-:-:-:1 ISCADD readFs2, readIs2, readFs2, 2; + +--:-:-:-:1 ISCADD readFs2, readFs2, 4x<32*4>, 4; +--:-:-:-:1 SHL readIs2, readIs2, 3; + +// writeCs = readIs2 * 32*36 + readFs2; +--:-:-:-:0 XMAD writeCs, readIs2, 1x<32*36>, readFs2; + + +--:-:-:-:5 @P4 BRA.U SKIP0; + +--:-:1:-:1 LDS idxK, [addr_idx_K]; +--:-:2:-:1 LDS idxC, [addr_idx_C]; +[+ our $determ; return $determ ? q{--:-:3:-:1 LDS idxI, [addr_iYXN];} : ''; +] + + + +--:-:-:-:1 LOP.AND tid_31, tid_128, 31; +--:-:-:-:1 SHR.U32 tid_32, tid_128, 5; +--:-:-:-:1 SHR.U32 tid_64, tid_128, 6; + +// readCs = (tid_32 * 32*36 + tid_31 + tid_64 * 16) * 4 +--:-:-:-:1 XMAD readCs, tid_32, 1x<32*36>, tid_31; +--:-:-:-:1 ISCADD readCs, tid_64, readCs, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// k = K_blk*32 + tid_31 +// c = C_blk*32 + tid_32<<1 +--:-:-:-:1 SHL tid_32, tid_32, 1; +01:-:-:-:1 ISCADD k, idxK, tid_31, 5; +02:-:-:-:1 ISCADD c, idxC, tid_32, 5; + + +// offsetF = c*RSK + r*SK + s*K + k (only the c*RSK + k part is computed here) +--:-:-:-:1 XMAD.LO2C offsetF, c, param_RSK, k; + +[+ + our $determ; + return $determ ? 
q{ +--:-:-:-:1 MOV CRSK, param_CRSK; +04:-:-:-:1 XMAD.LO offsetF, idxI, CRSK, offsetF, xmad_determ; + } : ''; ++] + +--:-:-:-:1 LEA Out00.CC, offsetF, param_F[0], 2; +--:-:-:-:1 LEA.HI.X Out01, offsetF, param_F[1], RZ, 2; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, PT; + + +SKIP0: + + +--:-:-:-:1 IADD Out10.CC, Out00, param_Kp; +--:-:-:-:1 IADD.X Out11, Out01, RZ; +--:-:-:-:1 IADD Out20.CC, Out10, param_Kp; +--:-:-:-:1 IADD.X Out21, Out11, RZ; + +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, ccx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, ccx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, ccx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP1; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +01:-:-:-:5 IADD Out00.CC, Out00, param_SKp; +--:-:-:-:1 IADD c, c, 1; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_SKp; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_SKp; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + +SKIP1: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y1, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y1, alpha; +--:-:-:-:1 FMUL 
shuffle_x3y0, ccx3y1, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP2; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +01:-:-:-:5 IADD Out00.CC, Out00, param_RSK15_SK2p; +--:-:-:-:1 IADD c, c, 15; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_RSK15_SK2p; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_RSK15_SK2p; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + +SKIP2: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y4, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y6, alpha; +--:-:-:-:1 FMUL 
shuffle_x5y1, ccx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y6, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP3; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +01:-:-:-:5 IADD Out00.CC, Out00, param_SKp; +--:-:-:-:1 IADD c, c, 1; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_SKp; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_SKp; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + +SKIP3: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y5, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP4; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP4: + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + +// P1 = (c < param_C) && P0, where P0 = (k < param_K) was set before SKIP0 +--:-:-:-:0 ISETP.LT.AND P1, PT, c, param_C, P0; + +[+ + # Emits 36 LDS instructions filling m<j><i> from readCs + 4x<(j*6+i)*32>. + # The last load of each group i (j == 5) sets barrier i+1; the others set none. + my $out; + foreach my $i (0 .. 5) + { + foreach my $j (0 .. 5) + { + my $b = $j == 5 ? 
$i + 1 : '-'; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + +[+ + # First reduction pass, once per second index i (0..5), combining m0i..m5i into w0i..w2i: + # w0 = m0 + (m1 + m2) + (m3 + m4) + # w1 = 0.625*(m1 - m2) + 1.5*(m3 - m4) + # w2 = m5 + 2.25*(m3 + m4) + 0.390625*(m1 + m2) + # The two leading FADDs wait on mask 1 << i, i.e. the barrier set by load group i above. + my $out; + foreach my $i (0 .. 5) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ +$w:-:-:-:1 FADD t0, m1$i, m2$i; +$w:-:-:-:1 FADD t1, m3$i, m4$i; +--:-:-:-:1 FADD m1$i, m1$i, -m2$i; +--:-:-:-:1 FADD m3$i, m3$i, -m4$i; +--:-:-:-:1 FADD w0$i, m0$i, t0; +--:-:-:-:1 FADD w0$i, w0$i, t1; +--:-:-:-:1 FMUL w1$i, m1$i, 0.625; +--:-:-:-:1 FFMA w1$i, m3$i, 1.5, w1$i; +--:-:-:-:1 FFMA w2$i, t1, 2.25, m5$i; +--:-:-:-:1 FFMA w2$i, t0, 0.390625, w2$i; + }; + } + return $out; ++] + + + +[+ + # Second reduction pass, once per first index i (0..2), applying the same three + # formulas along the other index: w<i>0..w<i>5 are combined into s<i>0, s<i>1, s<i>2. + my $out; + foreach my $i (0 .. 2) + { + $out .= qq{ +--:-:-:-:1 FADD t0, w${i}1, w${i}2; +--:-:-:-:1 FADD t1, w${i}3, w${i}4; +--:-:-:-:1 FADD w${i}1, w${i}1, -w${i}2; +--:-:-:-:1 FADD w${i}3, w${i}3, -w${i}4; +--:-:-:-:1 FADD s${i}0, w${i}0, t0; +--:-:-:-:1 FADD s${i}0, s${i}0, t1; +--:-:-:-:1 FMUL s${i}1, w${i}1, 0.625; +--:-:-:-:1 FFMA s${i}1, w${i}3, 1.5, s${i}1; +--:-:-:-:1 FFMA s${i}2, t1, 2.25, w${i}5; +--:-:-:-:1 FFMA s${i}2, t0, 0.390625, s${i}2; + }; + } + return $out; ++] + +//--:-:1:-:1 I2F.F32.S32 temp, c; + + +// Three groups of stores: group i writes s<i>0, s<i>1, s<i>2 through Out0, Out1, Out2 under P1; +// after the first two groups all three pointers advance by param_SKp. +--:1:-:-:1 @P1 [+ output_op() +] [Out0], s00; +--:2:-:-:1 @P1 [+ output_op() +] [Out1], s01; +--:3:-:-:1 @P1 [+ output_op() +] [Out2], s02; +01:-:-:-:6 IADD Out00.CC, Out00, param_SKp; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_SKp; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_SKp; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + + + +--:1:-:-:1 @P1 [+ output_op() +] [Out0], s10; +--:2:-:-:1 @P1 [+ output_op() +] [Out1], s11; +--:3:-:-:1 @P1 [+ output_op() +] [Out2], s12; +01:-:-:-:6 IADD Out00.CC, Out00, param_SKp; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_SKp; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_SKp; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + + + +--:1:-:-:1 @P1 [+ output_op() +] [Out0], s20; +--:2:-:-:1 @P1 [+ output_op() 
+] [Out1], s21; +--:3:-:-:1 @P1 [+ output_op() +] [Out2], s22; + + + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32.sass new file mode 100644 index 0000000..d4b2941 --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32.sass @@ -0,0 +1,1237 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $type; +our $dtype = $type eq 'h' ? 'U16' : '32'; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +our $dshift = $type eq 'h' ? '1' : '2'; +our $dsize = $type eq 'h' ? '2' : '4'; +our $vsize = $type eq 'h' ? 
'64' : '128'; +sub dtype { return $dtype; } +sub dsize { return $dsize; } +sub dshift { return $dshift; } +sub vsize { return $vsize; } +-] + + + + addr_zero : 4x<32*36*2*4 + 64 + 0> + addr_idx_Y : 4x<32*36*2*4 + 64 + 4> + addr_idx_X : 4x<32*36*2*4 + 64 + 5> + addr_idx_K : 4x<32*36*2*4 + 64 + 6> + + param_S[0] : c[0x0][0x140] + param_S[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_C : c[0x0][0x174] + param_K : c[0x0][0x178] + param_N : c[0x0][0x17c] + param_Y : c[0x0][0x180] + param_W : c[0x0][0x184] + param_YXN : c[0x0][0x188] + param_XN : c[0x0][0x18c] + param_Y2 : c[0x0][0x190] + param_GX : c[0x0][0x194] + param_Xk : c[0x0][0x198] + param_k : c[0x0][0x19c] + param_magic_Xk : c[0x0][0x1a0] + param_shift_Xk : c[0x0][0x1a4] + param_magic_k : c[0x0][0x1a8] + param_shift_k : c[0x0][0x1ac] + param_P : c[0x0][0x1b0] + param_Q : c[0x0][0x1b4] + param_QN : c[0x0][0x1b8] + param_PQN : c[0x0][0x1bc] + param_PQN15 : c[0x0][0x1c0] + param_maskN : c[0x0][0x1c4] + param_shiftX : c[0x0][0x1c8] + param_shiftY : c[0x0][0x1cc] + param_superX : c[0x0][0x1d0] + param_superY : c[0x0][0x1d4] + param_pad_x : c[0x0][0x1d8] + param_pad_y : c[0x0][0x1dc] + param_RSK : c[0x0][0x1e0] + param_RSK2p : c[0x0][0x1e4] + param_YXN2p : c[0x0][0x1e8] + param_gridN : c[0x0][0x1ec] + param_gridQN : c[0x0][0x1f0] + param_gridPQN : c[0x0][0x1f4] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10 : clx<0-3>y0 + 7, 6,15,14 : clx<0-3>y1 + 1, 0, 9, 8 : clx<0-3>y2 + 5, 4,13,12 : clx<0-3>y3 + 19,18,27,26 : clx<0-3>y4 + 23,22,31,30 : clx<0-3>y5 + 17,16,25,24 : clx<0-3>y6 + 21,20,29,28 : clx<0-3>y7 + + 32-43 : jl0Ix<0-3>, jl0Fy<0-7> + 44-51 : jl1Ix<0-3>, jl1Fy<4-7> + 36-39 : jl1Fy<0-3> + + 32-43 ~ swapBuff + + 88-89 : 
track<0-1> + 90-92 : writeS, pred30, pred36 + + // Image Transform + 44-51 ~ ti<0-5> + + 52,53,54,56,57,55 : i<0-5>0 + 59,60,61,63,58,62 : i<0-5>1 + 66,67,68,64,65,69 : i<0-5>2 + 73,74,75,71,72,70 : i<0-5>3 + 87,82,83,85,86,84 : i<0-5>4 + 80,81,76,78,79,77 : i<0-5>5 + + 52,53,54,56,57,55 : TI<0-5>0 + 59,60,61,63,58,62 : TI<0-5>1 + 66,67,68,64,65,69 : TI<0-5>2 + 73,74,75,71,72,70 : TI<0-5>3 + 87,82,83,85,86,84 : TI<0-5>4 + 80,81,76,78,79,77 : TI<0-5>5 + + 52,53,54,56,57,55 : I<0-5>0 + 59,60,61,63,58,62 : I<0-5>1 + 66,67,68,64,65,69 : I<0-5>2 + 73,74,75,71,72,70 : I<0-5>3 + 87,82,83,85,86,84 : I<0-5>4 + 80,81,76,78,79,77 : I<0-5>5 + + // Filter Transform + 44-47 ~ rcp6, rcp8, rcp12, rcp24 + + 52,53,54 : f<0-2>0 + 55,56,57 : f<0-2>1 + 58,59,60 : f<0-2>2 + + 61,62,63 : tf<0-2>0 + 64,65,66 : tf<0-2>1 + 67,68,69 : tf<0-2>2 + + 70,71,72,73,74,54 : TF<0-5>0 + 76,77,78,79,80,57 : TF<0-5>1 + 82,83,84,85,86,60 : TF<0-5>2 + + 61,64,48,49,50,51 : ff<0-5>0 + 52,53,55,56,58,59 : ff<0-5>1 + 61,64,48,49,50,51 : ff<0-5>2 + + 70,71,72,73,74,54 : F<0-5>0 + 62,63,65,66,67,68 : F<0-5>1 + 52,53,55,56,58,59 : F<0-5>2 + 69,75,81,87,76,77 : F<0-5>3 + 61,64,78,79,80,57 : F<0-5>4 + 82,83,84,85,86,60 : F<0-5>5 + + 32-39 ~ partialC, idx_K, idx_Y, idx_X + 40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, idx_Y2, idx_X2, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, gx, gy, c, kk, offset, sign, idx_N, nn, x<1-5>, mask_x, super_x, super_y, partC + + 32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1 + 48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16 + + + 3, 2,11,10,19,18,27,26 : ccx<0-7>y0 + 7, 6,15,14,23,22,31,30 : ccx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2 + 5, 4,13,12,21,20,29,28 : ccx<0-7>y3 + 35,34,43,42,51,50,59,58 : ccx<0-7>y4 + 39,38,47,46,55,54,63,62 : ccx<0-7>y5 + 33,32,41,40,49,48,57,56 : ccx<0-7>y6 + 37,36,45,44,53,52,61,60 : ccx<0-7>y7 + + 64-79 : jc0Ix<0-7>, jc0Fy<0-7> + 80-91 : jc1Ix<4-7>, jc1Fy<0-7> + 64-67 : jc1Ix<0-3> + + 64-86 ~ tid16, tid_1, tid128 + 92 = swapBuf 
+ + 87 = tid + 93-95 ~ C, readFs, readIs + + 64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q + 86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset + + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + + // t00 80 r00 78 + // t10 m10 r01 w01 + // t20 m20 r02 w02 + // t30 m30 r03 w03 + // w00 m00 s00 w00 + // w30 m40 s01 w01 + // w10 m10 s02 w02 + // w20 m20 s03 w04 + + 78 = t0<0-5>, r<0-3>0 + 79 = temp + + 3, 2,11,10,19,18 : m<0-5>0 + 1, 9, 0, 8,17,16 : m<0-5>1 + 27,26,25,24,64,65 : m<0-5>2 + 2,11,10 : t10, t20, t30 + 9, 0, 8 : t11, t21, t31 + 26,25,24 : t12, t22, t32 + 3, 2,11,19 : w00, w10, w20, w30 + 1, 9, 0,17 : w01, w11, w21, w31 + 27,26,25,64 : w02, w12, w22, w32 + + 66,67,68,69,70,71 : m<0-5>3 + 72,73,74,75,76,77 : m<0-5>4 + 8,24,10,65,16,18 : m<0-5>5 + 67,68,69 : t13, t23, t33 + 73,74,75 : t14, t24, t34 + 24,10,65 : t15, t25, t35 + 66,67,68,70 : w03, w13, w23, w33 + 72,73,74,76 : w04, w14, w24, w34 + 8,24,10,16 : w05, w15, w25, w35 + + 1,27,66 : r01, r02, r03 + 9,26,67 : r11, r12, r13 + 0,25,68 : r21, r22, r23 + 17,64,70 : r31, r32, r33 + 3, 1,27,72 : s00, s01, s02, s03 + 2, 9,26,73 : s10, s11, s12, s13 + 11, 0,25,74 : s20, s21, s22, s23 + 19,17,64,76 : s30, s31, s32, s33 + + 80-83 ~ xx<0-3> + 78-81 ~ sum<0-3> + 82-83 : Sum<0-1> + 84-85 : Out<0-1> + + 8,10,16,18 ~ b0<0-3> + 24,65,66,67 ~ b1<0-3> + 68,69,70,71 ~ b2<0-3> + 75,77,78,79 ~ b3<0-3> + + + +--:-:-:-:0 MOV C, param_C; +--:-:1:-:2 S2R tid, SR_TID.X; +01:-:-:-:0 ISETP.GE.AND P0, PT, tid, 128, PT; +--:-:-:-:1 STS.128 [addr_zero], RZ; +--:-:-:Y:c LOP.AND partialC, C, 1; +--:-:-:-:0 IADD C, C, partialC; +--:-:-:-:5 @P0 BRA.U COMPUTE_SETUP; + +############################################################## +LOAD_SETUP: + +--:-:1:-:1 S2R idx_YXk, SR_CTAID.X; +--:-:2:-:1 S2R idx_K, SR_CTAID.Y; + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +] + +--:-:-:-:1 ISETP.EQ.AND 
P0, PT, tid, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, tid, 64, PT; + +// idx_Y2 = idx_YXk / blk_Xk +--:-:-:-:1 MOV magic_Xk, param_magic_Xk; +--:-:-:-:1 IADD negXk, RZ, -param_Xk; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_Xk, 1, PT; +01:-:-:-:1 @P3 XMAD div1, idx_YXk, magic_Xk, RZ; +--:-:-:-:1 @P3 XMAD div2, idx_YXk, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD div3, idx_YXk.H1, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk, div1; +--:-:-:-:1 @P3 IADD3.RS idx_Y2, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 idx_Y2, idx_Y2, param_shift_Xk; +--:-:-:-:1 @!P3 SHR.U32 idx_Y2, idx_YXk, param_shift_Xk; + +// idx_Xk = idx_YXk % blk_Xk +--:-:-:-:1 XMAD.LO2 idx_Xk, negXk, idx_Y2, idx_YXk; + +// idx_X2 = idx_Xk / blk_k +// idx_k = idx_Xk % blk_k +--:-:-:-:1 XMAD idx_X2, idx_Xk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_X2, idx_X2, param_shift_k; +--:-:-:-:1 XMAD idx_k, idx_X2, param_k, RZ; +--:-:-:-:1 IADD idx_k, -idx_k, idx_Xk; + +// idx_K = idx_K * blk_k + idx_k +02:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +// gx = x2 +// gy = y2 * 2 +--:-:-:-:1 MOV idx_X, idx_X2; +--:-:-:-:1 SHL idx_Y, idx_Y2, 1; + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// if y2 != Y2: +// gy += (gx&1) ^ ((gx&2)>>1) +// gx /= 2 +--:-:-:-:1 ISETP.NE.AND P4, PT, idx_Y2, param_Y2, PT; +--:-:-:-:1 @P4 LOP.AND x1, idx_X, 1; +--:-:-:-:1 @P4 BFE.U32 x2, idx_X, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P4 LOP.XOR x1, x1, x2; +--:-:-:-:1 @P4 IADD idx_Y, idx_Y, x1; +--:-:-:-:1 @P4 SHR.U32 idx_X, idx_X, 1; + +// Scan backwards on odd rows +// if y2 & 1: +// gx = gridX - gx - 1 +--:-:-:-:1 LOP.AND.NZ P5, RZ, idx_Y2, 1; +--:-:-:-:1 @P5 IADD idx_X, -idx_X, param_GX; +--:-:-:-:1 @P5 IADD idx_X, idx_X, -1; + +--:6:-:-:1 @P0 STS [addr_idx_Y], idx_Y; +--:6:-:-:1 @P0 STS [addr_idx_X], idx_X; +--:6:-:-:1 @P0 STS [addr_idx_K], idx_K; + +// x = gx << shiftX +// y = gy << shiftY +--:-:-:-:1 SHL gx, idx_X, param_shiftX; +--:-:-:-:1 SHL gy, idx_Y, 
param_shiftY; + +// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp +--:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 super_y, tid, param_superY; +--:-:-:-:1 ISCADD gx, super_x, gx, 2; +--:-:-:-:1 ISCADD gy, super_y, gy, 2; + +--:-:-:-:1 LOP.AND tid32_2, tid, -32; +--:-:-:-:1 SHR.U32 tid32_2, tid32_2, 2; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid32_2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD3 readFs, readFs, tid1, tid32_2; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*36*2>, 4; + +// c = (tid & 32) >> 5 +--:-:-:-:1 BFE.U32 c, tid, 0x105; // 1 bits at position 5 + +// writeS = c*32*36 + tid & 31 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 XMAD writeS, c, 1152, tid31; +--:-:-:-:1 SHL writeS, writeS, 2; + + + +--:-:-:-:5 @P1 BRA.U FILTER_SETUP; + +############################################################## +IMAGE_SETUP: + +--:-:1:-:1 S2R idx_N, SR_CTAID.Z; + + +--:-:-:-:1 STS [writeS + 4x<32*0>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*1>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*2>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*3>], RZ; + +// n = idx_N*32 + tid & maskN +--:-:-:-:1 LOP.AND nn, tid, param_maskN; +01:-:-:-:1 ISCADD nn, idx_N, nn, 5; + +// n < N +--:-:-:-:1 ISETP.LT.AND P6, PT, nn, param_N, PT; + +// Subtract off the padding +--:-:-:-:1 IADD gx, gx, -param_pad_x; +--:-:-:-:1 IADD gy, gy, -param_pad_y; + +// offset = c*YXN + y0*XN + x0*N + n; +--:-:-:-:1 XMAD.S16.U16 offset, gx, param_N, nn; +--:-:-:-:1 XMAD.S16.U16.LO2C offset, gy, param_XN, offset; +--:-:-:-:1 XMAD.S16.U16.LO2C offset, c, param_YXN, offset; +--:-:-:-:1 ISET.LT.AND sign, offset, RZ, PT; + +--:-:-:-:1 LEA track0.CC, offset, param_I[0], [+ dshift() 
+]; +--:-:-:-:1 IADD.X track1, sign, param_I[1]; + +--:-:-:-:1 IADD x1, gx, 1; +--:-:-:-:1 IADD x2, gx, 2; +--:-:-:-:1 IADD x3, gx, 3; +--:-:-:-:1 IADD x4, gx, 4; +--:-:-:-:1 IADD x5, gx, 5; + +--:-:-:-:1 ISETP.LT.AND P0, PT, gx, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, x4, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, x5, param_W, PT; +--:-:-:-:1 ISETP.GE.AND P0, PT, gx, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, x4, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, x5, RZ, P5; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x3f; + +--:-:-:-:1 IADD x1, gy, 1; +--:-:-:-:1 IADD x2, gy, 2; +--:-:-:-:1 IADD x3, gy, 3; +--:-:-:-:1 IADD x4, gy, 4; +--:-:-:-:1 IADD x5, gy, 5; +--:-:-:-:1 ISETP.LT.AND P0, PT, gy, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P4, PT, x4, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, x5, param_Y, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, gy, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, x4, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, x5, RZ, P5; + +--:-:-:-:1 SEL pred30, mask_x, RZ, P0; +--:-:-:-:1 @P1 BFI pred30, mask_x, 0x606, pred30; +--:-:-:-:1 @P2 BFI pred30, mask_x, 0x60c, pred30; +--:-:-:-:1 @P3 BFI pred30, mask_x, 0x612, pred30; +--:-:-:-:1 @P4 BFI pred30, mask_x, 0x618, pred30; +--:-:-:-:1 SEL pred36, mask_x, RZ, P5; + +// P6 = c == partialC == 1 +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, partialC, P6; +--:-:-:-:1 XMAD partC, partialC, param_YXN, RZ; +--:-:-:-:1 XMAD.PSL 
partialC, partialC, param_YXN.H1, partC; +--:-:-:-:1 SHL partialC, partialC, [+ dshift() +]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30; +20:-:-:-:1 @!P0 MOV i00, RZ; +--:-:-:-:1 @!P1 MOV i01, RZ; +--:-:-:-:1 @!P2 MOV i02, RZ; +--:-:-:-:1 @!P3 MOV i03, RZ; +--:-:-:-:1 @!P4 MOV i04, RZ; +--:-:-:-:1 @!P5 MOV i05, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i04, [track + [+ dsize() +]x<0*$W*$N + 4*$N>]; +--:-:1:-:1 @P5 LDG.E.CI.[+ dtype() +] i05, [track + [+ dsize() +]x<0*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @!P0 MOV i10, RZ; +--:-:-:-:1 @!P1 MOV i11, RZ; +--:-:-:-:1 @!P2 MOV i12, RZ; +--:-:-:-:1 @!P3 MOV i13, RZ; +--:-:-:-:1 @!P4 MOV i14, RZ; +--:-:-:-:1 @!P5 MOV i15, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i14, [track + [+ dsize() +]x<1*$W*$N + 4*$N>]; +--:-:2:-:1 @P5 LDG.E.CI.[+ dtype() +] i15, [track + [+ dsize() +]x<1*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @!P0 MOV i20, RZ; +--:-:-:-:1 @!P1 MOV i21, RZ; +--:-:-:-:1 @!P2 MOV i22, RZ; +--:-:-:-:1 @!P3 MOV i23, RZ; 
+--:-:-:-:1 @!P4 MOV i24, RZ; +--:-:-:-:1 @!P5 MOV i25, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i24, [track + [+ dsize() +]x<2*$W*$N + 4*$N>]; +--:-:3:-:1 @P5 LDG.E.CI.[+ dtype() +] i25, [track + [+ dsize() +]x<2*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @!P0 MOV i30, RZ; +--:-:-:-:1 @!P1 MOV i31, RZ; +--:-:-:-:1 @!P2 MOV i32, RZ; +--:-:-:-:1 @!P3 MOV i33, RZ; +--:-:-:-:1 @!P4 MOV i34, RZ; +--:-:-:-:1 @!P5 MOV i35, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i34, [track + [+ dsize() +]x<3*$W*$N + 4*$N>]; +--:-:4:-:1 @P5 LDG.E.CI.[+ dtype() +] i35, [track + [+ dsize() +]x<3*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.L.U64 pred30, pred30, 24, pred30; +--:-:-:-:1 @!P0 MOV i40, RZ; +--:-:-:-:1 @!P1 MOV i41, RZ; +--:-:-:-:1 @!P2 MOV i42, RZ; +--:-:-:-:1 @!P3 MOV i43, RZ; +--:-:-:-:1 @!P4 MOV i44, RZ; +--:-:-:-:1 @!P5 MOV i45, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i40, [track + [+ dsize() +]x<4*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i41, [track + [+ dsize() +]x<4*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i42, [track + [+ dsize() +]x<4*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ 
dtype() +] i43, [track + [+ dsize() +]x<4*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i44, [track + [+ dsize() +]x<4*$W*$N + 4*$N>]; +--:-:5:-:1 @P5 LDG.E.CI.[+ dtype() +] i45, [track + [+ dsize() +]x<4*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred36, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P0 MOV i50, RZ; +--:-:-:-:1 @!P1 MOV i51, RZ; +--:-:-:-:1 @!P2 MOV i52, RZ; +--:-:-:-:1 @!P3 MOV i53, RZ; +--:-:-:-:1 @!P4 MOV i54, RZ; +--:-:-:-:1 @!P5 MOV i55, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i50, [track + [+ dsize() +]x<5*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i51, [track + [+ dsize() +]x<5*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i52, [track + [+ dsize() +]x<5*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i53, [track + [+ dsize() +]x<5*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i54, [track + [+ dsize() +]x<5*$W*$N + 4*$N>]; +--:-:6:-:1 @P5 LDG.E.CI.[+ dtype() +] i55, [track + [+ dsize() +]x<5*$W*$N + 5*$N>]; + + + +--:-:-:-:5 BAR.SYNC 0; + +3f:-:-:-:5 IADD track0.CC, track0, -partialC; +--:-:-:-:1 IADD writeS, writeS, 4x<32*36*2*2>; +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:5 BRA.U IMAGE_LOOP; + +############################################################## +FILTER_SETUP: + + +// writeS += 32*36*2*4 +--:-:-:-:1 IADD writeS, writeS, 4x<32*36*2>; + +--:-:-:-:1 MOV swapBuf, 4x<32*36*2*2>; + +// P6 = c == partialC == 1 +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, partialC, P6; +--:-:-:-:1 XMAD partC, partialC, param_RSK, RZ; +--:-:-:-:1 XMAD.PSL partialC, partialC, param_RSK.H1, partC; +--:-:-:-:1 SHL partialC, partialC, [+ dshift() +]; + +// k = idx_K*32 + tid & 31 +--:-:-:-:1 ISCADD kk, idx_K, tid31, 5; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, !P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, kk, param_K, PT; + +// a0 = c*RSK + k +--:-:-:-:1 XMAD.LO2C offset, c, param_RSK, kk; +--:-:-:-:1 LEA track0.CC, offset, param_F[0], [+ dshift() +]; +--:-:-:-:1 
LEA.HI.X track1, offset, param_F[1], RZ, [+ dshift() +]; + +--:-:-:-:1 STS [writeS + 4x<32*0>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*1>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*2>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*3>], RZ; + +20:-:-:-:1 @!P6 MOV f00, RZ; +--:-:-:-:1 @!P6 MOV f01, RZ; +--:-:-:-:1 @!P6 MOV f02, RZ; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f00, [track + [+ dsize() +]x<0*3*$K + 0*$K>]; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f01, [track + [+ dsize() +]x<0*3*$K + 1*$K>]; +--:-:1:-:1 @P6 LDG.E.CI.[+ dtype() +] f02, [track + [+ dsize() +]x<0*3*$K + 2*$K>]; + +--:-:-:-:1 @!P6 MOV f10, RZ; +--:-:-:-:1 @!P6 MOV f11, RZ; +--:-:-:-:1 @!P6 MOV f12, RZ; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f10, [track + [+ dsize() +]x<1*3*$K + 0*$K>]; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f11, [track + [+ dsize() +]x<1*3*$K + 1*$K>]; +--:-:2:-:1 @P6 LDG.E.CI.[+ dtype() +] f12, [track + [+ dsize() +]x<1*3*$K + 2*$K>]; + +--:-:-:-:1 @!P6 MOV f20, RZ; +--:-:-:-:1 @!P6 MOV f21, RZ; +--:-:-:-:1 @!P6 MOV f22, RZ; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f20, [track + [+ dsize() +]x<2*3*$K + 0*$K>]; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f21, [track + [+ dsize() +]x<2*3*$K + 1*$K>]; +--:5:3:-:1 @P6 LDG.E.CI.[+ dtype() +] f22, [track + [+ dsize() +]x<2*3*$K + 2*$K>]; + + + +--:-:-:-:5 BAR.SYNC 0; + +10:-:-:-:4 IADD track0.CC, track0, -partialC; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:5 BRA.U FILTER_LOOP; + +############################################################## + +COMPUTE_SETUP: + + +--:-:-:-:1 MOV swapBuf, 4x<32*36*2*2>; + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +--:-:-:-:1 IADD tid128, tid, -128; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readFs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid128, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readIs, tid128, 0x201; 
// 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid16; +--:-:-:-:1 ISCADD readIs, readIs, 4x<32*4 + 32*36*2*2>, 4; + +--:-:-:-:1 LOP.AND tid_1, tid128, 1; +--:-:-:-:1 LOP.AND readFs, tid128, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 IADD3 readFs, readFs, tid16, tid_1; +--:-:-:-:0 ISCADD readFs, readFs, 4x<32*4 + 32*36*2*3>, 4; + + +--:-:-:-:5 BAR.SYNC 0; + +// Let Load loop run once to transform initial load and store to shared. +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>]; +--:-:1:-:2 LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>]; + +COMPUTE_LOOP: +[+ + my %insert = ( + + j0c33 => "--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT;\n" . + "--:-:-:-:1 IADD C, C, -2;\n", + + j0c62 => "02:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j1c63 => "--:-:-:Y:5 \@P0 BRA.U COMPUTE_LOOP;\n" . + "--:-:-:Y:5 BRA.U COMPUTE_FINISH;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 1) + { + my $odd = $j; + my $nOdd = 1 - $j; + my $rsPred = $j == 1 ? '@P0' : ' '; + my $bar = $j == 0 ? 
'2' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd; + + $insert{"j${j}c31"} = sprintf "--:%s:1:-:1 %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd; + + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + #$stall = '4' if $stall && $c % 2 == 0 && $j == 0 && $c > 16; + + my $yield = $c % 5 == 0 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +IMAGE_LOOP: + +[+ + our $convert_in; return $convert_in ? 
q{ +01:-:-:-:1 F2F.F32.F16 i00, i00; +--:-:-:-:1 F2F.F32.F16 i01, i01; +--:-:-:-:1 F2F.F32.F16 i02, i02; +--:-:-:-:1 F2F.F32.F16 i03, i03; +--:-:-:-:1 F2F.F32.F16 i04, i04; +--:-:1:-:1 F2F.F32.F16 i05, i05; + +02:-:-:-:1 F2F.F32.F16 i10, i10; +--:-:-:-:1 F2F.F32.F16 i11, i11; +--:-:-:-:1 F2F.F32.F16 i12, i12; +--:-:-:-:1 F2F.F32.F16 i13, i13; +--:-:-:-:1 F2F.F32.F16 i14, i14; +--:-:2:-:1 F2F.F32.F16 i15, i15; + +04:-:-:-:1 F2F.F32.F16 i20, i20; +--:-:-:-:1 F2F.F32.F16 i21, i21; +--:-:-:-:1 F2F.F32.F16 i22, i22; +--:-:-:-:1 F2F.F32.F16 i23, i23; +--:-:-:-:1 F2F.F32.F16 i24, i24; +--:-:3:-:1 F2F.F32.F16 i25, i25; + +08:-:-:-:1 F2F.F32.F16 i30, i30; +--:-:-:-:1 F2F.F32.F16 i31, i31; +--:-:-:-:1 F2F.F32.F16 i32, i32; +--:-:-:-:1 F2F.F32.F16 i33, i33; +--:-:-:-:1 F2F.F32.F16 i34, i34; +--:-:4:-:1 F2F.F32.F16 i35, i35; + +10:-:-:-:1 F2F.F32.F16 i40, i40; +--:-:-:-:1 F2F.F32.F16 i41, i41; +--:-:-:-:1 F2F.F32.F16 i42, i42; +--:-:-:-:1 F2F.F32.F16 i43, i43; +--:-:-:-:1 F2F.F32.F16 i44, i44; +--:-:5:-:1 F2F.F32.F16 i45, i45; + +20:-:-:-:1 F2F.F32.F16 i50, i50; +--:-:-:-:1 F2F.F32.F16 i51, i51; +--:-:-:-:1 F2F.F32.F16 i52, i52; +--:-:-:-:1 F2F.F32.F16 i53, i53; +--:-:-:-:1 F2F.F32.F16 i54, i54; +--:-:6:-:2 F2F.F32.F16 i55, i55; + } : ''; ++] + +[+ + my $out; + foreach my $i (0 .. 5) + { + my $w = $i == 0 ? 
'3f' : '--'; + $out .= qq{ +$w:-:-:-:1 FFMA ti4, i2$i, -2.640625, i4$i; +--:-:-:-:1 FFMA ti5, i3$i, -2.640625, i5$i; +--:-:-:-:1 FFMA ti0, i2$i, -2.25, i4$i; +--:-:-:-:1 FFMA ti1, i1$i, -2.25, i3$i; +--:-:-:-:1 FFMA ti2, i2$i, -0.390625, i4$i; +--:-:-:-:1 FFMA ti3, i1$i, -0.390625, i3$i; +--:-:-:-:1 FFMA TI0$i, i0$i, 0.87890625, ti4; +--:-:-:-:1 FFMA TI5$i, i1$i, 0.87890625, ti5; +--:-:-:-:1 FFMA TI1$i, ti1, 0.625, ti0; +--:-:-:-:1 FFMA TI2$i, ti1, -0.625, ti0; +--:-:-:-:1 FFMA TI3$i, ti3, 1.5, ti2; +--:-:-:-:1 FFMA TI4$i, ti3, -1.5, ti2; + }; + } + return $out; ++] + +--:-:-:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; + +--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT; + +--:-:-:-:1 IADD track0.CC, track0, param_YXN2p; +--:-:-:-:1 IADD.X track1, track1, RZ; + +//--:-:-:-:1 LOP32I.AND pred30, pred30, 0xffffff; +--:-:-:-:1 @!P0 BFI pred36, RZ, 0x600, pred36; +--:-:-:-:1 @!P0 MOV pred30, RZ; + +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; + +[+ + my $out; + foreach my $i (0 .. 
5) + { + $out .= qq{ +--:-:-:-:1 FFMA ti4, TI${i}2, -2.640625, TI${i}4; +--:-:-:-:1 FFMA ti5, TI${i}3, -2.640625, TI${i}5; +--:-:-:-:1 FFMA ti0, TI${i}2, -2.25, TI${i}4; +--:-:-:-:1 FFMA ti1, TI${i}1, -2.25, TI${i}3; +--:-:-:-:1 FFMA ti2, TI${i}2, -0.390625, TI${i}4; +--:-:-:-:1 FFMA ti3, TI${i}1, -0.390625, TI${i}3; +--:-:-:-:1 FFMA I${i}0, TI${i}0, 0.87890625, ti4; +--:-:-:-:1 FFMA I${i}5, TI${i}1, 0.87890625, ti5; +--:-:-:-:1 FFMA I${i}1, ti1, 0.625, ti0; +--:-:-:-:1 FFMA I${i}2, ti1, -0.625, ti0; +--:-:-:-:1 FFMA I${i}3, ti3, 1.5, ti2; +--:-:-:-:1 FFMA I${i}4, ti3, -1.5, ti2; + }; + } + return $out; ++] + +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 3)>], I03; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 4)>], I04; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 0)>], I00; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 5)>], I05; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 1)>], I01; +--:1:-:-:1 STS [writeS + 4x<32*(0*6 + 2)>], I02; + + +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 0)>], I10; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 5)>], I15; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 3)>], I13; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 1)>], I11; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 2)>], I12; +--:2:-:-:1 STS [writeS + 4x<32*(1*6 + 4)>], I14; + +01:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i04, [track + [+ dsize() +]x<0*$W*$N + 4*$N>]; +--:-:1:-:1 @P5 LDG.E.CI.[+ dtype() +] i05, [track + [+ dsize() +]x<0*$W*$N + 5*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i00, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i01, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i02, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i03, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i04, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i05, RZ; +--:-:-:-:1 R2P PR, pred30, 
0x3f; + +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 0)>], I20; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 5)>], I25; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 1)>], I21; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 2)>], I22; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 3)>], I23; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:3:-:-:1 STS [writeS + 4x<32*(2*6 + 4)>], I24; + +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 0)>], I30; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 5)>], I35; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 1)>], I31; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 2)>], I32; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 3)>], I33; +--:4:-:-:1 STS [writeS + 4x<32*(3*6 + 4)>], I34; + +02:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i14, [track + [+ dsize() +]x<1*$W*$N + 4*$N>]; +--:-:2:-:1 @P5 LDG.E.CI.[+ dtype() +] i15, [track + [+ dsize() +]x<1*$W*$N + 5*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i10, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i11, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i12, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i13, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i14, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i15, RZ; + +--:-:-:-:5 R2P PR, pred30, 0x3f; // FORCE + +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 0)>], I40; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 5)>], I45; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 1)>], I41; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 2)>], I42; + + + + + +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 3)>], I43; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 4)>], I44; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; + +--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>]; +--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>]; + +04:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() 
+]x<2*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i24, [track + [+ dsize() +]x<2*$W*$N + 4*$N>]; +--:-:3:-:1 @P5 LDG.E.CI.[+ dtype() +] i25, [track + [+ dsize() +]x<2*$W*$N + 5*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i20, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i21, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i22, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i23, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i24, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i25, RZ; +--:-:-:-:6 R2P PR, pred30, 0x3f; // FORCE + +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 0)>], I50; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 5)>], I55; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 1)>], I51; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 2)>], I52; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 3)>], I53; +--:6:-:-:1 STS [writeS + 4x<32*(5*6 + 4)>], I54; + +08:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i34, [track + [+ dsize() +]x<3*$W*$N + 4*$N>]; +--:-:4:-:1 @P5 LDG.E.CI.[+ dtype() +] i35, [track + [+ dsize() +]x<3*$W*$N + 5*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i30, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i31, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i32, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i33, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i34, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i35, RZ; +--:-:-:-:c R2P PR, pred30, 0x3f; // FORCE + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i40, [track + [+ dsize() +]x<4*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i41, 
[track + [+ dsize() +]x<4*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i42, [track + [+ dsize() +]x<4*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i43, [track + [+ dsize() +]x<4*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i44, [track + [+ dsize() +]x<4*$W*$N + 4*$N>]; +--:-:5:-:1 @P5 LDG.E.CI.[+ dtype() +] i45, [track + [+ dsize() +]x<4*$W*$N + 5*$N>]; +--:-:-:-:1 SHF.L.U64 pred30, pred30, 24, pred30; + +--:-:-:-:1 @!P0 I2I.U32.U32 i40, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i41, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i42, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i43, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i44, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i45, RZ; +--:-:-:-:a R2P PR, pred36, 0x3f; // FORCE + +20:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i50, [track + [+ dsize() +]x<5*$W*$N + 0*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i50, RZ; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i51, [track + [+ dsize() +]x<5*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i52, [track + [+ dsize() +]x<5*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i53, [track + [+ dsize() +]x<5*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i54, [track + [+ dsize() +]x<5*$W*$N + 4*$N>]; +--:-:6:-:1 @P5 LDG.E.CI.[+ dtype() +] i55, [track + [+ dsize() +]x<5*$W*$N + 5*$N>]; +--:-:-:-:1 @!P1 I2I.U32.U32 i51, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i52, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i53, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i54, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i55, RZ; + + + +[+ + our ($vsize, $dsize, $convert_in); + my %insert = ( + j0c15 => "--:-:5:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 && $j == 1 ? 
"10" : '--'; + + my $ctrl = "$wait:-:-:-:1"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + +--:-:-:-:1 LOP.AND.Z P0, RZ, pred36, 0x100; +--:-:-:-:1 LOP.XOR pred36, pred36, 0x100; + +--:-:-:-:1 ISETP.GT.AND P1, PT, C, 0, PT; + +--:-:-:-:1 @P0 MOV32I swapBuff, 4x<32*36*2*2>; + + +--:-:-:-:1 @!P0 MOV32I swapBuff, -4x<32*36*2*2>; +--:-:-:-:0 IADD C, C, -2; +--:-:-:Y:5 BAR.SYNC 0; +--:-:-:-:1 IADD readFs, readFs, swapBuff; +--:-:-:-:1 IADD readIs, readIs, swapBuff; +--:-:-:-:1 IADD writeS, writeS, -swapBuff; +--:-:-:Y:5 @P1 BRA.U IMAGE_LOOP; +--:-:-:Y:5 BRA.U LOAD_FINISH; + +FILTER_LOOP: + +[+ + our $convert_in; return $convert_in ? q{ +01:-:-:-:1 F2F.F32.F16 f00, f00; +--:-:-:-:1 F2F.F32.F16 f01, f01; +--:-:1:-:1 F2F.F32.F16 f02, f02; + +02:-:-:-:1 F2F.F32.F16 f10, f10; +--:-:-:-:1 F2F.F32.F16 f11, f11; +--:-:2:-:1 F2F.F32.F16 f12, f12; + +04:-:-:-:1 F2F.F32.F16 f20, f20; +--:-:-:-:1 F2F.F32.F16 f21, f21; +--:-:3:-:1 F2F.F32.F16 f22, f22; + } : ''; ++] + +--:-:-:-:1 MOV32I rcp6, 0.688403361344538; +--:-:-:-:1 MOV32I rcp8, 0.430252100840336; +--:-:-:-:1 MOV32I rcp24, 0.119514472455649; +--:-:-:-:1 MOV32I rcp12, 0.179271708683473; +07:-:-:-:1 FMUL32I tf00, f20, 0.26890756302521; +--:-:-:-:1 FMUL32I tf01, f21, 0.26890756302521; +--:-:-:-:1 FMUL32I tf02, f22, 0.26890756302521; +--:-:-:-:1 FFMA tf10, f00, -rcp6, -tf00; +--:-:-:-:1 FFMA tf20, f00, rcp24, tf00; +--:-:-:-:1 FFMA tf11, f01, -rcp6, -tf01; +--:-:-:-:1 FFMA tf21, f01, rcp24, tf01; +--:-:-:-:1 FFMA tf12, f02, -rcp6, -tf02; +--:-:-:-:1 FFMA tf22, f02, rcp24, tf02; + +--:-:-:-:1 FMUL32I TF00, f00, 1.13777777777778; +--:-:-:-:1 FFMA TF10, f10, -rcp8, tf10; +--:-:-:-:1 FFMA TF20, f10, rcp8, tf10; +--:-:-:-:1 FFMA TF30, f10, rcp12, tf20; +--:-:-:-:1 FFMA TF40, f10, -rcp12, tf20; +//--:-:-:-:1 MOV TF50, f20; + +--:-:-:-:1 FMUL32I TF02, f02, 1.13777777777778; +--:-:-:-:1 FFMA TF12, f12, -rcp8, tf12; +--:-:-:-:1 FFMA TF22, 
f12, rcp8, tf12; +--:-:-:-:1 FFMA TF32, f12, rcp12, tf22; +--:-:-:-:1 FFMA TF42, f12, -rcp12, tf22; +//--:-:-:-:1 MOV TF52, f22; + +--:-:-:-:1 FMUL32I TF01, f01, 1.13777777777778; +--:-:-:-:1 FFMA TF11, f11, -rcp8, tf11; +--:-:-:-:1 FFMA TF21, f11, rcp8, tf11; +--:-:-:-:1 FFMA TF31, f11, rcp12, tf21; +--:-:-:-:1 FFMA TF41, f11, -rcp12, tf21; +//--:-:-:-:1 MOV TF51, f21; + +--:-:-:-:1 FMUL32I ff00, TF02, 0.26890756302521; +--:-:-:-:1 FMUL32I ff10, TF12, 0.26890756302521; +--:-:-:-:1 FMUL32I ff20, TF22, 0.26890756302521; +--:-:-:-:1 FMUL32I ff30, TF32, 0.26890756302521; +--:-:-:-:1 FMUL32I ff40, TF42, 0.26890756302521; +--:-:-:-:1 FMUL32I ff50, TF52, 0.26890756302521; +--:-:-:-:1 FFMA ff01, TF00, -rcp6, -ff00; +--:-:-:-:1 FFMA ff02, TF00, rcp24, ff00; +--:-:-:-:1 FFMA ff11, TF10, -rcp6, -ff10; +--:-:-:-:1 FFMA ff12, TF10, rcp24, ff10; +--:-:-:-:1 FFMA ff21, TF20, -rcp6, -ff20; +--:-:-:-:1 FFMA ff22, TF20, rcp24, ff20; +--:-:-:-:1 FFMA ff31, TF30, -rcp6, -ff30; +--:-:-:-:1 FFMA ff32, TF30, rcp24, ff30; +--:-:-:-:1 FFMA ff41, TF40, -rcp6, -ff40; +--:-:-:-:1 FFMA ff42, TF40, rcp24, ff40; +--:-:-:-:1 FFMA ff51, TF50, -rcp6, -ff50; +--:-:-:-:1 FFMA ff52, TF50, rcp24, ff50; + +--:-:-:-:1 FMUL32I F00, TF00, 1.13777777777778; +--:-:-:-:1 FFMA F01, TF01, -rcp8, ff01; +--:-:-:-:1 FFMA F02, TF01, rcp8, ff01; +--:-:-:-:1 FFMA F03, TF01, rcp12, ff02; +--:-:-:-:1 FFMA F04, TF01, -rcp12, ff02; +//--:-:-:-:1 MOV F05, TF02; + +--:-:-:-:1 FMUL32I F10, TF10, 1.13777777777778; +--:-:-:-:1 FFMA F11, TF11, -rcp8, ff11; +--:-:-:-:1 FFMA F12, TF11, rcp8, ff11; +--:-:-:-:1 FFMA F13, TF11, rcp12, ff12; +--:-:-:-:1 FFMA F14, TF11, -rcp12, ff12; +//--:-:-:-:1 MOV F15, TF12; + +--:-:-:-:1 FMUL32I F20, TF20, 1.13777777777778; +--:-:-:-:1 FFMA F21, TF21, -rcp8, ff21; +--:-:-:-:1 FFMA F22, TF21, rcp8, ff21; +--:-:-:-:1 FFMA F23, TF21, rcp12, ff22; +--:-:-:-:1 FFMA F24, TF21, -rcp12, ff22; +//--:-:-:-:1 MOV F25, TF22; + +--:-:-:-:1 FMUL32I F30, TF30, 1.13777777777778; +--:-:-:-:1 FFMA F31, TF31, 
-rcp8, ff31; +--:-:-:-:1 FFMA F32, TF31, rcp8, ff31; +--:-:-:-:1 FFMA F33, TF31, rcp12, ff32; +--:-:-:-:1 FFMA F34, TF31, -rcp12, ff32; +//--:-:-:-:1 MOV F35, TF32; + +--:-:-:-:1 FMUL32I F40, TF40, 1.13777777777778; +--:-:-:-:1 FFMA F41, TF41, -rcp8, ff41; +--:-:-:-:1 FFMA F42, TF41, rcp8, ff41; +--:-:-:-:1 FFMA F43, TF41, rcp12, ff42; +--:-:-:-:1 FFMA F44, TF41, -rcp12, ff42; +//--:-:-:-:1 MOV F45, TF42; + +--:-:-:-:1 FMUL32I F50, TF50, 1.13777777777778; +--:-:-:-:1 FFMA F51, TF51, -rcp8, ff51; +--:-:-:-:1 FFMA F52, TF51, rcp8, ff51; +--:-:-:-:1 FFMA F53, TF51, rcp12, ff52; +--:-:-:-:1 FFMA F54, TF51, -rcp12, ff52; +//--:-:-:-:1 MOV F55, TF52; + +--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, P2; +--:-:-:-:1 ISETP.GT.AND P1, PT, C, 0, PT; +--:-:-:-:1 IADD C, C, -2; + +--:-:-:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:6:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; + +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 5)>], F55; + +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 0)>], F00; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 1)>], F01; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 2)>], F02; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 3)>], F03; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 4)>], F04; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 5)>], F05; + +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 0)>], F10; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 1)>], F11; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 2)>], F12; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 3)>], F13; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 4)>], F14; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 5)>], F15; + +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 0)>], F20; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 1)>], F21; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 2)>], F22; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 3)>], F23; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 4)>], F24; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 5)>], F25; + +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 0)>], F30; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 
1)>], F31; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 2)>], F32; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 3)>], F33; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 4)>], F34; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 5)>], F35; + +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 0)>], F40; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 1)>], F41; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 2)>], F42; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 3)>], F43; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 4)>], F44; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 5)>], F45; + + + +20:-:-:-:1 IADD track0.CC, track0, param_RSK2p; +--:-:-:-:1 IADD.X track1, track1, RZ; + +[+ + our ($dtype, $dsize, $SK, $K); + my %insert = ( + j0c0 => "--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n", + j0c1 => "--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n", + j0c15 => "--:-:5:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + + j0c5 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 0)>], F50;\n", + j0c7 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 1)>], F51;\n", + j0c9 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 2)>], F52;\n", + j0c11 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 3)>], F53;\n", + j0c13 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 4)>], F54;\n", + + j1c1 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n", + j1c2 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n", + j1c3 => "--:-:1:-:1 \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n", + + j1c4 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n", + j1c5 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n", + j1c6 => "--:-:2:-:1 \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n", + + j1c7 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n", + j1c8 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n", + j1c9 => "--:-:3:-:1 \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 
2*$K>];\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 && $j == 1 ? "10" : '--'; + + my $ctrl = "$wait:-:-:-:1"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + + + +--:-:-:-:1 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 IADD readIs, readIs, -swapBuf; +--:-:-:-:0 IADD writeS, writeS, swapBuf; +--:-:-:Y:5 BAR.SYNC 0; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; +--:-:-:Y:5 @P1 BRA.U FILTER_LOOP; + + +LOAD_FINISH: + +[- + our $trans1 = "0.244140625"; + our $trans2 = "0.625"; + our $trans3 = "0.390625"; +-] + + diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_X.sass b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_X.sass new file mode 100644 index 0000000..15a0f0b --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_X.sass @@ -0,0 +1,687 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $type; +our $dtype = $type eq 'h' ? 'U16' : '32'; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 
'F2F.F16.F32' : ''; +our $dshift = $type eq 'h' ? '1' : '2'; +our $dsize = $type eq 'h' ? '2' : '4'; +our $vsize = $type eq 'h' ? '64' : '128'; +sub dtype { return $dtype; } +sub dsize { return $dsize; } +sub dshift { return $dshift; } +sub vsize { return $vsize; } +-] + + + + addr_zero : 4x<32*36*2*4 + 64 + 0> + addr_idx_Y : 4x<32*36*2*4 + 64 + 4> + addr_idx_X : 4x<32*36*2*4 + 64 + 5> + addr_idx_K : 4x<32*36*2*4 + 64 + 6> + + param_S[0] : c[0x0][0x140] + param_S[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_C : c[0x0][0x174] + param_K : c[0x0][0x178] + param_N : c[0x0][0x17c] + param_Xk : c[0x0][0x180] + param_k : c[0x0][0x184] + param_magic_Xk : c[0x0][0x188] + param_shift_Xk : c[0x0][0x18c] + param_magic_k : c[0x0][0x190] + param_shift_k : c[0x0][0x194] + param_C_1152 : c[0x0][0x198] + param_GXS_C_1152 : c[0x0][0x19c] + param_GYS_GXS_C_1152 : c[0x0][0x1a0] + param_P : c[0x0][0x1a4] + param_Q : c[0x0][0x1a8] + param_QN : c[0x0][0x1ac] + param_PQN : c[0x0][0x1b0] + param_PQN15 : c[0x0][0x1b4] + param_maskN : c[0x0][0x1b8] + param_shiftX : c[0x0][0x1bc] + param_shiftY : c[0x0][0x1c0] + param_superX : c[0x0][0x1c4] + param_superY : c[0x0][0x1c8] + param_gridN : c[0x0][0x1cc] + param_gridQN : c[0x0][0x1d0] + param_gridPQN : c[0x0][0x1d4] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10 : clx<0-3>y0 + 7, 6,15,14 : clx<0-3>y1 + 1, 0, 9, 8 : clx<0-3>y2 + 5, 4,13,12 : clx<0-3>y3 + 19,18,27,26 : clx<0-3>y4 + 23,22,31,30 : clx<0-3>y5 + 17,16,25,24 : clx<0-3>y6 + 21,20,29,28 : clx<0-3>y7 + + 32-43 : jl0Ix<0-3>, jl0Fy<0-7> + 44-51 : jl1Ix<0-3>, jl1Fy<4-7> + 36-39 : jl1Fy<0-3> + + 52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3> + 88-89 : 
track<0-1> + 90-91 ~ writeS + + 32-39 ~ partialC, idx_K, idx_Y, idx_X + 40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, c, offset, idx_N + + 32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1 + 48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16 + + + 3, 2,11,10,19,18,27,26 : ccx<0-7>y0 + 7, 6,15,14,23,22,31,30 : ccx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2 + 5, 4,13,12,21,20,29,28 : ccx<0-7>y3 + 35,34,43,42,51,50,59,58 : ccx<0-7>y4 + 39,38,47,46,55,54,63,62 : ccx<0-7>y5 + 33,32,41,40,49,48,57,56 : ccx<0-7>y6 + 37,36,45,44,53,52,61,60 : ccx<0-7>y7 + + 64-79 : jc0Ix<0-7>, jc0Fy<0-7> + 80-91 : jc1Ix<4-7>, jc1Fy<0-7> + 64-67 : jc1Ix<0-3> + + 64-86 ~ tid16, tid_1, tid128 + + 87 = tid + 92-95 ~ C, swapBuf, readFs, readIs + + 64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q + 86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset + + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + + // t00 80 r00 78 + // t10 m10 r01 w01 + // t20 m20 r02 w02 + // t30 m30 r03 w03 + // w00 m00 s00 w00 + // w30 m40 s01 w01 + // w10 m10 s02 w02 + // w20 m20 s03 w04 + + 78 = t0<0-5>, r<0-3>0 + 79 = temp + + 3, 2,11,10,19,18 : m<0-5>0 + 1, 9, 0, 8,17,16 : m<0-5>1 + 27,26,25,24,64,65 : m<0-5>2 + 2,11,10 : t10, t20, t30 + 9, 0, 8 : t11, t21, t31 + 26,25,24 : t12, t22, t32 + 3, 2,11,19 : w00, w10, w20, w30 + 1, 9, 0,17 : w01, w11, w21, w31 + 27,26,25,64 : w02, w12, w22, w32 + + 66,67,68,69,70,71 : m<0-5>3 + 72,73,74,75,76,77 : m<0-5>4 + 8,24,10,65,16,18 : m<0-5>5 + 67,68,69 : t13, t23, t33 + 73,74,75 : t14, t24, t34 + 24,10,65 : t15, t25, t35 + 66,67,68,70 : w03, w13, w23, w33 + 72,73,74,76 : w04, w14, w24, w34 + 8,24,10,16 : w05, w15, w25, w35 + + 1,27,66 : r01, r02, r03 + 9,26,67 : r11, r12, r13 + 0,25,68 : r21, r22, r23 + 17,64,70 : r31, r32, r33 + 3, 1,27,72 : s00, s01, s02, s03 + 2, 9,26,73 : s10, s11, s12, s13 + 11, 0,25,74 : s20, s21, s22, s23 + 
19,17,64,76 : s30, s31, s32, s33 + + 80-83 ~ xx<0-3> + 78-81 ~ sum<0-3> + 82-83 : Sum<0-1> + 84-85 : Out<0-1> + + 8,10,16,18 ~ b0<0-3> + 24,65,66,67 ~ b1<0-3> + 68,69,70,71 ~ b2<0-3> + 75,77,78,79 ~ b3<0-3> + + + +--:-:-:-:0 MOV C, param_C; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:-:-:1 MOV swapBuf, 4x<32*36*2*2>; +01:-:-:-:0 ISETP.GE.AND P0, PT, tid, 128, PT; +--:-:-:-:1 STS.128 [addr_zero], RZ; +--:-:-:Y:c LOP.AND partialC, C, 1; +--:-:-:-:0 IADD C, C, partialC; +--:-:-:-:5 @P0 BRA.U COMPUTE_SETUP; + +############################################################## +LOAD_SETUP: + +--:-:1:-:1 S2R idx_YXk, SR_CTAID.X; +--:-:2:-:1 S2R idx_K, SR_CTAID.Y; + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +] + +--:-:-:-:1 ISETP.EQ.AND P0, PT, tid, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, tid, 64, PT; + +// idx_Y = idx_YXk / blk_Xk +--:-:-:-:1 MOV magic_Xk, param_magic_Xk; +--:-:-:-:1 IADD negXk, RZ, -param_Xk; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_Xk, 1, PT; +01:-:-:-:1 @P3 XMAD div1, idx_YXk, magic_Xk, RZ; +--:-:-:-:1 @P3 XMAD div2, idx_YXk, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD div3, idx_YXk.H1, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk, div1; +--:-:-:-:1 @P3 IADD3.RS idx_Y, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 idx_Y, idx_Y, param_shift_Xk; +--:-:-:-:1 @!P3 SHR.U32 idx_Y, idx_YXk, param_shift_Xk; + +// idx_Xk = idx_YXk % blk_Xk +--:-:-:-:1 XMAD.LO2 idx_Xk, negXk, idx_Y, idx_YXk; + +// idx_X = idx_Xk / blk_k +// idx_k = idx_Xk % blk_k +--:-:-:-:1 XMAD idx_X, idx_Xk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_X, idx_X, param_shift_k; +--:-:-:-:1 XMAD idx_k, idx_X, param_k, RZ; +--:-:-:-:1 IADD idx_k, -idx_k, idx_Xk; + +// idx_K = idx_K * blk_k + idx_k +02:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +--:-:-:-:1 @P0 STS [addr_idx_Y], idx_Y; +--:-:-:-:1 @P0 STS [addr_idx_X], idx_X; +--:-:-:-:1 @P0 STS [addr_idx_K], idx_K; + + +--:-:-:-:1 LOP.AND tid32_2, tid, -32; +--:-:-:-:1 SHR.U32 tid32_2, 
tid32_2, 2; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid32_2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD3 readFs, readFs, tid1, tid32_2; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*36*2>, 4; + +// c = (tid & 63) >> 5 +--:-:-:-:1 BFE.U32 c, tid, 0x105; // 1 bit at position 5 + +// partialC = (2 - partialC) +// P6 = c < partialC +// partialC *= 32*36 * itemsize +--:-:-:-:1 IADD partialC, -partialC, 2; +--:-:-:-:1 ISETP.LT.AND P6, PT, c, partialC, PT; +--:-:-:-:1 XMAD partialC, partialC, 1x<32*36 * $dsize>, RZ; + +// writeS = (c*32*36 + (tid & 31)*4)*4 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL writeS, tid31, 4; +--:-:-:-:1 XMAD writeS, c, 4x<32*36>, writeS; + +// offset = c*32*36 + tid31*4 +--:-:-:-:1 SHL tid31, tid31, 2; +--:-:-:-:1 XMAD offset, c, 1x<32*36>, tid31; + + +// P5 = C > 2 +--:-:-:-:1 ISETP.GT.AND P5, PT, C, 2, PT; + + +--:-:-:-:5 @P1 BRA.U FILTER_SETUP; + +############################################################## +IMAGE_SETUP: + +--:-:1:-:1 S2R idx_N, SR_CTAID.Z; + +// (GN,GYS,GXS,C,6,6,32) +// offset += (idx_N*GYS*GXS*C*32*36 + idx_Y*GXS*C*32*36 + idx_X*C*32*36) * itemsize; +--:-:-:-:1 XMAD.LO2C offset, idx_X, param_C_1152, offset; +--:-:-:-:1 XMAD.LO2C offset, idx_Y, param_GXS_C_1152, offset; +01:-:-:-:1 XMAD.LO2C offset, idx_N, param_GYS_GXS_C_1152, offset; +--:-:-:-:1 LEA track0.CC, offset, param_I[0], [+ dshift() +]; +--:-:-:-:0 LEA.HI.X track1, offset, param_I[1], RZ, [+ dshift() +]; + + +--:-:-:-:5 BRA.U LOAD; + +############################################################## +FILTER_SETUP: + + +// writeS += 32*36*2*4 +--:-:-:-:1 IADD writeS, writeS, 4x<32*36*2>; + +// (kBlks,C,6,6,32) +// offset += (idx_K*C*32*36) * itemsize; +--:-:-:-:1
XMAD.LO2C offset, idx_K, param_C_1152, offset; +--:-:-:-:1 LEA track0.CC, offset, param_F[0], [+ dshift() +]; +--:-:-:-:2 LEA.HI.X track1, offset, param_F[1], RZ, [+ dshift() +]; + + +############################################################## +LOAD: + +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>]; +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>]; +--:-:2:-:1 @P6 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T0, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T1, [addr_zero]; +--:-:2:-:1 @!P6 LDS.U.[+ vsize() +] T2, [addr_zero]; + +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>]; +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>]; +--:-:3:-:1 @P6 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T3, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T4, [addr_zero]; +--:-:3:-:1 @!P6 LDS.U.[+ vsize() +] T5, [addr_zero]; + +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>]; +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>]; +--:-:4:-:1 @P6 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T6, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T7, [addr_zero]; +--:-:4:-:1 @!P6 LDS.U.[+ vsize() +] T8, [addr_zero]; + +[+ + our $convert_in; + return $convert_in ? 
q{ + +02:-:-:-:1 F2F.F32.F16 T03, T01.H1; +--:-:-:-:1 F2F.F32.F16 T02, T01.H0; +--:-:-:-:1 F2F.F32.F16 T01, T00.H1; +--:-:2:-:1 F2F.F32.F16 T00, T00.H0; + +--:-:-:-:1 F2F.F32.F16 T13, T11.H1; +--:-:-:-:1 F2F.F32.F16 T12, T11.H0; +--:-:-:-:1 F2F.F32.F16 T11, T10.H1; +--:-:5:-:1 F2F.F32.F16 T10, T10.H0; + +--:-:-:-:1 F2F.F32.F16 T23, T21.H1; +--:-:-:-:1 F2F.F32.F16 T22, T21.H0; +--:-:-:-:1 F2F.F32.F16 T21, T20.H1; +--:-:6:-:1 F2F.F32.F16 T20, T20.H0; + +02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0; + +04:-:-:-:1 F2F.F32.F16 T33, T31.H1; +--:-:-:-:1 F2F.F32.F16 T32, T31.H0; +--:-:-:-:1 F2F.F32.F16 T31, T30.H1; +--:-:3:-:1 F2F.F32.F16 T30, T30.H0; + +10:-:-:-:1 STS.128 [writeS + 4x<1*32*4>], T1; + +--:-:-:-:1 F2F.F32.F16 T43, T41.H1; +--:-:-:-:1 F2F.F32.F16 T42, T41.H0; +--:-:-:-:1 F2F.F32.F16 T41, T40.H1; +--:-:5:-:1 F2F.F32.F16 T40, T40.H0; + +20:-:-:-:1 STS.128 [writeS + 4x<2*32*4>], T2; + +--:-:-:-:1 F2F.F32.F16 T53, T51.H1; +--:-:-:-:1 F2F.F32.F16 T52, T51.H0; +--:-:-:-:1 F2F.F32.F16 T51, T50.H1; +--:-:6:-:1 F2F.F32.F16 T50, T50.H0; + +04:-:-:-:1 STS.128 [writeS + 4x<3*32*4>], T3; + +08:-:-:-:1 F2F.F32.F16 T63, T61.H1; +--:-:-:-:1 F2F.F32.F16 T62, T61.H0; +--:-:-:-:1 F2F.F32.F16 T61, T60.H1; +--:-:4:-:1 F2F.F32.F16 T60, T60.H0; + +10:-:-:-:1 STS.128 [writeS + 4x<4*32*4>], T4; + +--:-:-:-:1 F2F.F32.F16 T73, T71.H1; +--:-:-:-:1 F2F.F32.F16 T72, T71.H0; +--:-:-:-:1 F2F.F32.F16 T71, T70.H1; +--:-:5:-:1 F2F.F32.F16 T70, T70.H0; + +20:-:-:-:1 STS.128 [writeS + 4x<5*32*4>], T5; + +--:-:-:-:1 F2F.F32.F16 T83, T81.H1; +--:-:-:-:1 F2F.F32.F16 T82, T81.H0; +--:-:-:-:1 F2F.F32.F16 T81, T80.H1; +--:-:6:-:1 F2F.F32.F16 T80, T80.H0; + +08:-:-:-:1 STS.128 [writeS + 4x<6*32*4>], T6; +10:-:-:-:1 STS.128 [writeS + 4x<7*32*4>], T7; +20:-:-:-:1 STS.128 [writeS + 4x<8*32*4>], T8; + + } : q{ +02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0; +--:-:-:-:1 STS.128 [writeS + 4x<1*32*4>], T1; +--:-:-:-:1 STS.128 [writeS + 4x<2*32*4>], T2; +04:-:-:-:1 STS.128 [writeS + 4x<3*32*4>], T3; 
+--:-:-:-:1 STS.128 [writeS + 4x<4*32*4>], T4; +--:-:-:-:1 STS.128 [writeS + 4x<5*32*4>], T5; +08:-:-:-:1 STS.128 [writeS + 4x<6*32*4>], T6; +--:-:-:-:1 STS.128 [writeS + 4x<7*32*4>], T7; +--:-:-:-:1 STS.128 [writeS + 4x<8*32*4>], T8; + }; ++] + +--:-:-:-:0 IADD track0.CC, track0, partialC; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X track1, track1, RZ; + +--:-:-:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:1:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; + +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>]; +--:-:2:-:1 @P5 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>]; +--:-:3:-:1 @P5 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>]; +--:6:4:-:1 @P5 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>]; + +--:-:-:-:5 BRA.U LOAD_LOOP; + +############################################################## + +COMPUTE_SETUP: + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +--:-:-:-:1 IADD tid128, tid, -128; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readFs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid128, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readIs, tid128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid16; +--:-:-:-:1 ISCADD readIs, readIs, 4x<32*4>, 4; + +--:-:-:-:1 LOP.AND tid_1, tid128, 1; +--:-:-:-:1 LOP.AND readFs, tid128, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 IADD3 readFs, readFs, 
tid16, tid_1; +--:-:-:-:0 ISCADD readFs, readFs, 4x<32*4 + 32*36*2>, 4; + + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>]; +--:-:1:-:2 LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>]; + +COMPUTE_LOOP: +[+ + my %insert = ( + + j0c33 => "--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT;\n" . + "--:-:-:-:1 IADD C, C, -2;\n", + + j0c62 => "02:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readFs, readFs, swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j1c63 => "--:-:-:Y:5 \@P0 BRA.U COMPUTE_LOOP;\n" . + "--:-:-:Y:5 BRA.U COMPUTE_FINISH;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 1) + { + my $odd = $j; + my $nOdd = 1 - $j; + my $rsPred = $j == 1 ? '@P0' : ' '; + my $bar = $j == 0 ? '2' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd; + + $insert{"j${j}c31"} = sprintf "--:%s:1:-:1 %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd; + + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $yield = $c % 10 == 0 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +LOAD_LOOP: +--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT; +20:-:-:-:1 IADD track0.CC, track0, 1x<32*36*2 * $dsize>; +--:-:-:-:1 ISETP.GT.AND P1, PT, C, 4, PT; +--:-:-:-:1 IADD C, C, -2; +[+ + our ($vsize, $dsize, $convert_in); + my %insert = ( + + j0c3 => "--:-:-:-:1 IADD.X track1, track1, RZ;\n", + + j0c0 => "--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n", + j0c2 => "--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n", + j0c18 => "--:-:1:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + + j1c12 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n", + j1c14 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n", + j1c16 => "--:-:1:-:1 \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n", + + $convert_in ? ( + + j0c1 => "02:-:-:-:1 F2F.F32.F16 T03, T01.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T02, T01.H0;\n", + j0c4 => "--:-:-:-:1 F2F.F32.F16 T01, T00.H1;\n" . + "--:-:2:-:1 F2F.F32.F16 T00, T00.H0;\n", + + j0c5 => "--:-:-:-:1 F2F.F32.F16 T13, T11.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T12, T11.H0;\n", + j0c6 => "--:-:-:-:1 F2F.F32.F16 T11, T10.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T10, T10.H0;\n", + + j0c7 => "--:-:-:-:1 F2F.F32.F16 T23, T21.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T22, T21.H0;\n", + j0c8 => "--:-:-:-:1 F2F.F32.F16 T21, T20.H1;\n" . 
+ "--:-:6:-:1 F2F.F32.F16 T20, T20.H0;\n", + + j0c9 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n", + j0c10 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n", + j0c11 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n", + + j0c13 => "02:-:-:-:1 \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n", + j0c14 => "10:-:-:-:1 \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n", + j0c15 => "20:-:2:-:1 \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n", + + j0c16 => "04:-:-:-:1 F2F.F32.F16 T33, T31.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T32, T31.H0;\n", + j0c17 => "--:-:-:-:1 F2F.F32.F16 T31, T30.H1;\n" . + "--:-:3:-:1 F2F.F32.F16 T30, T30.H0;\n", + + j0c19 => "--:-:-:-:1 F2F.F32.F16 T43, T41.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T42, T41.H0;\n", + j0c20 => "--:-:-:-:1 F2F.F32.F16 T41, T40.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T40, T40.H0;\n", + + j0c21 => "--:-:-:-:1 F2F.F32.F16 T53, T51.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T52, T51.H0;\n", + j0c22 => "--:-:-:-:1 F2F.F32.F16 T51, T50.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 T50, T50.H0;\n", + + j0c23 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n", + j0c24 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n", + j0c25 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n", + + j0c27 => "04:-:-:-:1 \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n", + j0c28 => "10:-:-:-:1 \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n", + j0c29 => "20:-:3:-:1 \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n", + + j0c30 => "08:-:-:-:1 F2F.F32.F16 T63, T61.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T62, T61.H0;\n", + j0c31 => "--:-:-:-:1 F2F.F32.F16 T61, T60.H1;\n" . + "--:-:4:-:1 F2F.F32.F16 T60, T60.H0;\n", + + j1c0 => "--:-:-:-:1 F2F.F32.F16 T73, T71.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T72, T71.H0;\n", + j1c1 => "--:-:-:-:1 F2F.F32.F16 T71, T70.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T70, T70.H0;\n", + + j1c2 => "--:-:-:-:1 F2F.F32.F16 T83, T81.H1;\n" . 
+ "--:-:-:-:1 F2F.F32.F16 T82, T81.H0;\n", + j1c3 => "--:-:-:-:1 F2F.F32.F16 T81, T80.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 T80, T80.H0;\n", + + j1c4 => "08:4:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n", + j1c5 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n", + j1c6 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n", + + j1c8 => "08:-:-:-:1 \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n", + j1c9 => "10:-:-:-:1 \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n", + j1c10 => "20:6:4:-:1 \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n", + + ) : ( + + j0c6 => "02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0;\n", + j0c8 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n", + j0c10 => "--:2:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n", + + j0c12 => "02:-:-:-:1 \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n", + j0c14 => "--:-:-:-:1 \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n", + j0c16 => "--:-:2:-:1 \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n", + + j0c20 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n", + j0c22 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n", + j0c24 => "--:3:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n", + + j0c26 => "04:-:-:-:1 \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n", + j0c28 => "--:-:-:-:1 \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n", + j0c30 => "--:-:3:-:1 \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n", + + j1c0 => "08:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n", + j1c2 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n", + j1c4 => "--:4:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n", + + j1c6 => "08:-:-:-:1 \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n", + j1c8 => "--:-:-:-:1 \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n", + j1c10 => "--:6:4:-:1 \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n", + ), + + j1c11 => "--:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j1c31 => "--:-:-:Y:5 \@P0 BRA.U LOAD_LOOP;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "01" : '--'; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + +[- + our $trans1 = "0.343"; + our $trans2 = "0.700"; + our $trans3 = "0.490"; +-] + + diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_common.sass b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_common.sass new file mode 100644 index 0000000..f2a06e6 --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_common.sass @@ -0,0 +1,807 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--:-:1:-:2 S2R Tid, SR_TID.X; + +--:-:-:-:1 MOV alpha16, param_alpha; + +01:-:-:-:1 LOP.AND Tid32_2, Tid, -32; +--:-:-:-:1 SHR.U32 Tid32_2, Tid32_2, 2; + +// readFs = ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND Tid1, Tid, 1; +01:-:-:-:1 LOP.AND readFs, Tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD readFs, readFs, Tid1; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2) +--:-:-:-:1 BFE.U32 readIs, Tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, Tid32_2; +--:-:-:-:1 ISCADD readIs, readFs, readIs, 2; + +--:-:-:-:1 SHL readIs, readIs, 4; +--:-:-:-:1 SHL readFs, readFs, 3; + +// writeCs = readFs * 32*36 + readIs; +--:-:-:-:1 XMAD write16Cs, readFs, 1x<32*36>, readIs; + + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y2, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y2, alpha16; +--:-:-:-:4 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y3, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y3, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y4, alpha16; +--:-:-:-:1 
FMUL shuffle16_x2y0, clx2y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y6, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y6, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y7, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y7, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 EXIT; + +COMPUTE_FINISH: + +--:-:1:-:2 S2R tid_128, SR_TID.X; + + +--:-:-:-:1 MOV alpha, param_alpha; + +01:-:-:-:1 IADD tid_128, tid_128, -128; + +--:-:-:-:1 ISETP.GE.AND P4, PT, tid_128, 256, PT; + +// readFs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND Tid_1, tid_128, 1; +--:-:-:-:1 LOP.AND readFs2, tid_128, 8; +--:-:-:-:1 SHR.U32 readFs2, readFs2, 2; +--:-:-:-:1 IADD readFs2, readFs2, Tid_1; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2) +--:-:-:-:1 LOP.AND tid_16, tid_128, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; +--:-:-:-:1 BFE.U32 readIs2, tid_128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs2, readIs2, tid_16; +--:-:-:-:1 ISCADD readIs2, readFs2, readIs2, 2; + +--:-:-:-:1 ISCADD readIs2, readIs2, 4x<32*4>, 4; +--:-:-:-:1 SHL readFs2, readFs2, 3; + +// writeCs = readFs * 32*36 + readIs; +--:-:-:-:0 XMAD writeCs, readFs2, 1x<32*36>, readIs2; + + +--:-:-:-:5 @P4 BRA.U SKIP0; 
+ +--:-:2:-:1 LDS idxX, [addr_idx_X]; +--:-:3:-:1 LDS idxY, [addr_idx_Y]; +--:-:1:-:1 S2R idxN, SR_CTAID.Z; +--:-:4:-:1 LDS idxK, [addr_idx_K]; + + +--:-:-:-:1 LOP.AND tid_31, tid_128, 31; +--:-:-:-:1 SHR.U32 tid_32, tid_128, 5; +--:-:-:-:1 SHR.U32 tid_64, tid_128, 6; + +[+ + our $bsum; return $bsum ? q{ +03:-:-:-:1 XMAD bsum_offset, idxX, param_gridN, idxN; +04:-:-:-:1 XMAD.LO2C bsum_offset, idxY, param_gridQN, bsum_offset; + } : ''; ++] + +--:-:-:-:1 MOV32I one, 1.0; + +// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16 +--:-:-:-:1 XMAD readCs, tid_32, 1x<32*36>, tid_31; +--:-:-:-:1 ISCADD readCs, tid_64, readCs, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = idxN*32 + tid & maskN +--:-:-:-:1 LOP.AND n, tid_31, param_maskN; +01:-:-:-:1 ISCADD n, idxN, n, 5; + +// Superblock offset +// idxX <<= shiftX +// idxY <<= shiftY +02:-:-:-:1 SHL idxX, idxX, param_shiftX; +04:-:-:-:1 SHL idxY, idxY, param_shiftY; + +// Get this thread's offset within the superblock +--:-:-:-:1 BFE.U32 q, tid_31, param_superX; +--:-:-:-:1 BFE.U32 p, tid_31, param_superY; +--:-:-:-:1 ISCADD q, q, idxX, 2; +--:-:-:-:1 ISCADD p, p, idxY, 2; + +// k = idxK*32 + tid_32<<1 +--:-:-:-:1 SHL tid_32, tid_32, 1; +08:-:-:-:1 ISCADD k, idxK, tid_32, 5; + +// Out = k*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD offsetO, q, param_N, n; +--:-:-:-:1 XMAD.LO2C offsetO, p, param_QN, offsetO; +--:-:-:-:1 XMAD.LO2C offsetO, k, param_PQN, offsetO; + +--:-:-:-:1 IADD z1, q, 1; +--:-:-:-:1 IADD z2, q, 2; +--:-:-:-:1 IADD z3, q, 3; + +--:-:-:-:1 ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! 
no-op +--:-:-:-:1 ISETP.LT.AND P6, PT, n, param_N, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, q, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_Q, P5; +--:-:-:-:1 ISETP.GE.AND P0, PT, q, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; +--:-:-:-:1 P2R mask_q, PR, RZ, 0x0f; + +--:-:-:-:1 IADD z1, p, 1; +--:-:-:-:1 IADD z2, p, 2; +--:-:-:-:1 IADD z3, p, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, p, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_P, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, p, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; + +--:-:-:-:1 SEL preds, mask_q, RZ, P0; +--:-:-:-:1 @P1 BFI preds, mask_q, 0x404, preds; +--:-:-:-:1 @P2 BFI preds, mask_q, 0x408, preds; +--:-:-:-:1 @P3 BFI preds, mask_q, 0x40c, preds; + +--:-:-:-:1 ISETP.EQ.AND P6, PT, tid_31, RZ, PT; + + +SKIP0: + + +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, ccx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, ccx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, ccx7y2, alpha; + +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP1; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 1; +--:-:-:-:1 IADD offsetO, offsetO, param_PQN; + +SKIP1: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y1, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y1, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y1, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP2; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 15; +--:-:-:-:1 IADD offsetO, offsetO, param_PQN15; + +SKIP2: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y4, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; 
+--:-:-:-:1 FMUL shuffle_x0y1, ccx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y6, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP3; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 1; +--:-:-:-:1 IADD offsetO, offsetO, param_PQN; + +SKIP3: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y5, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP4; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP4: + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + + +11:-:-:-:1 ISETP.LT.AND P5, PT, k, param_K, PT; +[+ + our $bias; + return $bias ? 
q{ +--:-:-:-:1 LEA Sum0.CC, k, param_S[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, k, param_S[1], RZ, 2; + +--:-:-:-:1 @!P5 MOV bias, RZ; +--:-:5:-:1 @P5 LDG.E.CI bias, [Sum]; + } : ''; ++] + + +[+ + my $out; + foreach my $i (0 .. 2) + { + foreach my $j (0 .. 5) + { + my $b = $i + 1; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + + +[+ + my $out; our ($trans1, $trans2, $trans3); + foreach my $i (0 .. 2) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ + +$w:-:-:-:1 FADD t0$i, m1$i, m2$i; +--:-:-:-:1 FADD t1$i, m1$i, -m2$i; +--:-:-:-:1 FADD t2$i, m3$i, -m4$i; +--:-:-:-:1 FADD t3$i, m3$i, m4$i; +--:-:-:-:1 FADD w0$i, t0$i, m0$i; +--:-:-:-:1 FMUL32I w3$i, t1$i, $trans1; +--:-:-:-:1 FMUL32I w1$i, t1$i, $trans2; +--:-:-:-:1 FMUL32I temp, t0$i, $trans3; +--:-:-:-:1 FFMA w3$i, t2$i, 3.375, w3$i; +--:-:-:-:1 FFMA w1$i, t2$i, 1.500, w1$i; +--:-:-:-:1 FFMA w2$i, t3$i, 2.250, temp; +--:-:-:-:1 FADD w0$i, w0$i, t3$i; +--:-:-:-:1 FADD w3$i, w3$i, m5$i; + + }; + } + foreach my $i (3 .. 5) + { + foreach my $j (0 .. 5) + { + my $b = $i + 1; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + + + +[+ + my $out; our ($trans1, $trans2, $trans3); + + foreach my $i (3 .. 
5) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ + +$w:-:-:-:1 FADD t0$i, m1$i, m2$i; +--:-:-:-:1 FADD t1$i, m1$i, -m2$i; +--:-:-:-:1 FADD t2$i, m3$i, -m4$i; +--:-:-:-:1 FADD t3$i, m3$i, m4$i; +--:-:-:-:1 FADD w0$i, t0$i, m0$i; +--:-:-:-:1 FMUL32I w3$i, t1$i, $trans1; +--:-:-:-:1 FMUL32I w1$i, t1$i, $trans2; +--:-:-:-:1 FMUL32I temp, t0$i, $trans3; +--:-:-:-:1 FFMA w3$i, t2$i, 3.375, w3$i; +--:-:-:-:1 FFMA w1$i, t2$i, 1.500, w1$i; +--:-:-:-:1 FFMA w2$i, t3$i, 2.250, temp; +--:-:-:-:1 FADD w0$i, w0$i, t3$i; +--:-:-:-:1 FADD w3$i, w3$i, m5$i; + + }; + } + return $out; ++] +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + + + +[+ + my $out; + our ($convert_out, $bias, $relu, $trans1, $trans2, $trans3); + foreach my $i (0 .. 3) + { + $out .= qq{ +--:-:-:-:1 FADD r${i}0, w${i}1, w${i}2; +--:-:-:-:1 FADD r${i}1, w${i}1, -w${i}2; +--:-:-:-:1 FADD r${i}2, w${i}3, -w${i}4; +--:-:-:-:1 FADD r${i}3, w${i}3, w${i}4; +--:-:-:-:1 FADD s${i}0, r${i}0, w${i}0; +--:-:-:-:1 FMUL32I s${i}3, r${i}1, $trans1; +--:-:-:-:1 FMUL32I s${i}1, r${i}1, $trans2; +--:-:-:-:1 FMUL32I temp, r${i}0, $trans3; +--:-:-:-:1 FFMA s${i}3, r${i}2, 3.375, s${i}3; +--:-:-:-:1 FFMA s${i}1, r${i}2, 1.500, s${i}1; +--:-:-:-:1 FFMA s${i}2, r${i}3, 2.250, temp; +--:-:-:-:1 FADD s${i}0, s${i}0, r${i}3; +--:-:-:-:1 FADD s${i}3, s${i}3, w${i}5; + }; + if ($bias) + { + $out .= qq{ +10:-:-:-:1 FADD s${i}0, s${i}0, bias; +--:-:-:-:1 FADD s${i}1, s${i}1, bias; +--:-:-:-:1 FADD s${i}2, s${i}2, bias; +--:-:-:-:1 FADD s${i}3, s${i}3, bias;}; + } + if ($relu) + { + $out .= qq{ +--:-:-:-:1 FMNMX s${i}0, s${i}0, RZ, !PT; +--:-:-:-:1 FMNMX s${i}1, s${i}1, RZ, !PT; +--:-:-:-:1 FMNMX s${i}2, s${i}2, RZ, !PT; +--:-:-:-:1 FMNMX s${i}3, s${i}3, RZ, !PT;}; + } + } + return $out; ++] + + +[+ + our $prelu; my $out; + if ($prelu) + { + foreach my $i (0 .. 
3) + { + $out .= qq{ +// maximum(x, 0) + beta * minimum(0, x) +--:-:-:-:1 FMNMX b00, s${i}0, RZ, !PT; +--:-:-:-:1 FMNMX b01, s${i}1, RZ, !PT; +--:-:-:-:1 FMNMX b02, s${i}2, RZ, !PT; +--:-:-:-:1 FMNMX b03, s${i}3, RZ, !PT; + +--:-:-:-:1 FMNMX b10, s${i}0, RZ, PT; +--:-:-:-:1 FMNMX b11, s${i}1, RZ, PT; +--:-:-:-:1 FMNMX b12, s${i}2, RZ, PT; +--:-:-:-:1 FMNMX b13, s${i}3, RZ, PT; + +--:-:-:-:1 FFMA s${i}0, b10, param_beta, b00; +--:-:-:-:1 FFMA s${i}1, b11, param_beta, b01; +--:-:-:-:1 FFMA s${i}2, b12, param_beta, b02; +--:-:-:-:1 FFMA s${i}3, b13, param_beta, b03; + }; + } + } + return $out; ++] + +[+ + our ($beta, $brelu, $bprelu, $dtype, $dsize, $dshift, $convert_out, $Q, $N); + my $out; + if ($beta || $brelu || $bprelu) + { + my $preds = $beta ? q{ +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + } : ''; + + $out .= qq{ + +--:-:-:-:1 LEA Out0.CC, offsetO, param_X[0], $dshift; +--:-:-:-:1 LEA.HI.X Out1, offsetO, param_X[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CG.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>]; +--:-:-:-:1 \@P1 LDG.E.CG.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CG.$dtype b02, [Out + ${dsize}x<0*$Q*$N + 2*$N>]; +--:-:1:-:1 \@P3 LDG.E.CG.$dtype b03, [Out + ${dsize}x<0*$Q*$N + 3*$N>]; +--:-:-:-:1 \@!P0 MOV b00, RZ; +--:-:-:-:1 \@!P1 MOV b01, RZ; +--:-:-:-:1 \@!P2 MOV b02, RZ; +--:-:-:-:1 \@!P3 MOV b03, RZ; +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + +--:-:-:-:1 \@P0 LDG.E.CG.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>]; +--:-:-:-:1 \@P1 LDG.E.CG.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CG.$dtype b12, [Out + ${dsize}x<1*$Q*$N + 2*$N>]; +--:-:2:-:1 \@P3 LDG.E.CG.$dtype b13, [Out + ${dsize}x<1*$Q*$N + 3*$N>]; +--:-:-:-:1 \@!P0 MOV b10, RZ; +--:-:-:-:1 \@!P1 MOV b11, RZ; +--:-:-:-:1 \@!P2 MOV b12, RZ; +--:-:-:-:1 \@!P3 MOV b13, RZ; +--:-:-:-:1 \@P5 R2P 
PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + +--:-:-:-:1 \@P0 LDG.E.CG.$dtype b20, [Out + ${dsize}x<2*$Q*$N + 0*$N>]; +--:-:-:-:1 \@P1 LDG.E.CG.$dtype b21, [Out + ${dsize}x<2*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CG.$dtype b22, [Out + ${dsize}x<2*$Q*$N + 2*$N>]; +--:-:3:-:1 \@P3 LDG.E.CG.$dtype b23, [Out + ${dsize}x<2*$Q*$N + 3*$N>]; +--:-:-:-:1 \@!P0 MOV b20, RZ; +--:-:-:-:1 \@!P1 MOV b21, RZ; +--:-:-:-:1 \@!P2 MOV b22, RZ; +--:-:-:-:1 \@!P3 MOV b23, RZ; +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.L.U64 preds, preds, 12, preds; + +--:-:-:-:1 \@P0 LDG.E.CG.$dtype b30, [Out + ${dsize}x<3*$Q*$N + 0*$N>]; +--:-:-:-:1 \@P1 LDG.E.CG.$dtype b31, [Out + ${dsize}x<3*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CG.$dtype b32, [Out + ${dsize}x<3*$Q*$N + 2*$N>]; +--:-:4:-:1 \@P3 LDG.E.CG.$dtype b33, [Out + ${dsize}x<3*$Q*$N + 3*$N>]; +--:-:-:-:1 \@!P0 MOV b30, RZ; +--:-:-:-:1 \@!P1 MOV b31, RZ; +--:-:-:-:1 \@!P2 MOV b32, RZ; +--:-:-:-:1 \@!P3 MOV b33, RZ;$preds +}; + + if ($convert_out) + { + $out .= q{ +01:-:-:-:1 F2F.F32.F16 b00, b00; +--:-:-:-:1 F2F.F32.F16 b01, b01; +--:-:-:-:1 F2F.F32.F16 b02, b02; +--:-:1:-:1 F2F.F32.F16 b03, b03; +02:-:-:-:1 F2F.F32.F16 b10, b10; +--:-:-:-:1 F2F.F32.F16 b11, b11; +--:-:-:-:1 F2F.F32.F16 b12, b12; +--:-:2:-:1 F2F.F32.F16 b13, b13; +04:-:-:-:1 F2F.F32.F16 b20, b20; +--:-:-:-:1 F2F.F32.F16 b21, b21; +--:-:-:-:1 F2F.F32.F16 b22, b22; +--:-:3:-:1 F2F.F32.F16 b23, b23; +08:-:-:-:1 F2F.F32.F16 b30, b30; +--:-:-:-:1 F2F.F32.F16 b31, b31; +--:-:-:-:1 F2F.F32.F16 b32, b32; +--:-:4:-:1 F2F.F32.F16 b33, b33;}; + } + } + return $out; ++] + + +[+ + our $beta; return $beta ? 
q{ +01:-:-:-:1 FFMA s00, b00, param_beta, s00; +--:-:-:-:1 FFMA s01, b01, param_beta, s01; +--:-:-:-:1 FFMA s02, b02, param_beta, s02; +--:-:-:-:1 FFMA s03, b03, param_beta, s03; +02:-:-:-:1 FFMA s10, b10, param_beta, s10; +--:-:-:-:1 FFMA s11, b11, param_beta, s11; +--:-:-:-:1 FFMA s12, b12, param_beta, s12; +--:-:-:-:1 FFMA s13, b13, param_beta, s13; +04:-:-:-:1 FFMA s20, b20, param_beta, s20; +--:-:-:-:1 FFMA s21, b21, param_beta, s21; +--:-:-:-:1 FFMA s22, b22, param_beta, s22; +--:-:-:-:1 FFMA s23, b23, param_beta, s23; +08:-:-:-:1 FFMA s30, b30, param_beta, s30; +--:-:-:-:1 FFMA s31, b31, param_beta, s31; +--:-:-:-:1 FFMA s32, b32, param_beta, s32; +--:-:-:-:1 FFMA s33, b33, param_beta, s33;} : ''; ++] +[+ + our ($brelu, $bprelu); my $out; + if ($brelu || $bprelu) + { + foreach my $i (0 .. 3) + { + my $w = sprintf "%02x", 1 << $i; + $out .= $brelu ? qq{ +//delta *= (x > 0) +$w:-:-:-:1 FSETP.GT.AND P0, PT, b${i}0, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b${i}1, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b${i}2, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b${i}3, RZ, PT; +--:-:-:-:1 \@!P0 MOV s${i}0, RZ; +--:-:-:-:1 \@!P1 MOV s${i}1, RZ; +--:-:-:-:1 \@!P2 MOV s${i}2, RZ; +--:-:-:-:1 \@!P3 MOV s${i}3, RZ; + } : qq{ +//delta *= ((x > 0) + slope * (x < 0)) +$w:-:-:-:1 FSETP.GT.AND P0, PT, b${i}0, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b${i}1, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b${i}2, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b${i}3, RZ, PT; +--:-:-:-:1 SEL xx0, one, RZ, P0; +--:-:-:-:1 SEL xx1, one, RZ, P1; +--:-:-:-:1 SEL xx2, one, RZ, P2; +--:-:-:-:1 SEL xx3, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b${i}0, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b${i}1, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b${i}2, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b${i}3, RZ, PT; +--:-:-:-:1 SEL b${i}0, one, RZ, P0; +--:-:-:-:1 SEL b${i}1, one, RZ, P1; +--:-:-:-:1 SEL b${i}2, one, RZ, P2; +--:-:-:-:1 SEL b${i}3, one, RZ, P3; +--:-:-:-:1 FFMA b${i}0, b${i}0, param_beta, xx0; 
+--:-:-:-:1 FFMA b${i}1, b${i}1, param_beta, xx1; +--:-:-:-:1 FFMA b${i}2, b${i}2, param_beta, xx2; +--:-:-:-:1 FFMA b${i}3, b${i}3, param_beta, xx3; +--:-:-:-:1 FMUL s${i}0, s${i}0, b${i}0; +--:-:-:-:1 FMUL s${i}1, s${i}1, b${i}1; +--:-:-:-:1 FMUL s${i}2, s${i}2, b${i}2; +--:-:-:-:1 FMUL s${i}3, s${i}3, b${i}3; + }; + } + $out .= q{ +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:5 @!P5 R2P PR, RZ, 0x0f; +--:-:-:-:5 SHF.R.U64 preds, preds, 4, preds; +}; + } + return $out; ++] + +[+ + our $bsum; my $out; + if ($bsum) + { + $out = q{ + +--:-:-:-:1 XMAD.LO2C bias, k, param_gridPQN, bsum_offset; +--:-:-:-:1 LEA Sum0.CC, bias, param_S[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, bias, param_S[1], RZ, 2; +--:-:-:-:1 MOV sum0, RZ; +--:-:-:-:1 MOV sum1, RZ; +--:-:-:-:1 MOV sum2, RZ; +--:-:-:-:1 MOV sum3, RZ;}; + foreach my $i (0 .. 3) + { + my ($dir, $amt) = $i == 2 ? ('L','12') : ('R','4'); + $out .= qq{ +--:-:-:-:1 \@P0 FADD sum0, sum0, s${i}0; +--:-:-:-:1 \@P1 FADD sum1, sum1, s${i}1; +--:-:-:-:1 \@P2 FADD sum2, sum2, s${i}2; +--:-:-:-:1 \@P3 FADD sum3, sum3, s${i}3; +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.$dir.U64 preds, preds, $amt, preds;}; + } + $out .= q{ +--:-:-:-:1 FADD sum0, sum0, sum1; +--:-:-:-:1 FADD sum2, sum2, sum3; +--:-:-:-:1 FADD sum0, sum0, sum2; +}; + } + return $out; ++] +[+ + our $convert_out; return $convert_out ? 
q{ +--:-:-:-:1 F2F.F16.F32 s00, s00; +--:-:-:-:1 F2F.F16.F32 s01, s01; +--:-:-:-:1 F2F.F16.F32 s02, s02; +--:-:1:-:1 F2F.F16.F32 s03, s03; +--:-:-:-:1 F2F.F16.F32 s10, s10; +--:-:-:-:1 F2F.F16.F32 s11, s11; +--:-:-:-:1 F2F.F16.F32 s12, s12; +--:-:2:-:1 F2F.F16.F32 s13, s13; +--:-:-:-:1 F2F.F16.F32 s20, s20; +--:-:-:-:1 F2F.F16.F32 s21, s21; +--:-:-:-:1 F2F.F16.F32 s22, s22; +--:-:3:-:1 F2F.F16.F32 s23, s23; +--:-:-:-:1 F2F.F16.F32 s30, s30; +--:-:-:-:1 F2F.F16.F32 s31, s31; +--:-:-:-:1 F2F.F16.F32 s32, s32; +--:-:4:-:1 F2F.F16.F32 s33, s33;} : ''; ++] + +[+ + our ($bsum, $dtype, $dsize, $dshift, $Q, $N); + return $bsum ? qq{ +--:-:-:Y:6 LEA Out0.CC, offsetO, param_O[0], $dshift; +--:-:-:-:0 LEA.HI.X Out1, offsetO, param_O[1], RZ, $dshift; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 1, 0x1f; +01:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03; +--:-:-:-:2 \@P5 R2P PR, preds, 0x0f; +--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f; + +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:-:-:0 SHF.R.U64 preds, preds, 4, preds; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 2, 0x1f; + +02:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13; +--:-:-:-:2 \@P5 R2P PR, preds, 0x0f; +--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f; + +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:-:-:0 SHF.R.U64 preds, preds, 4, preds; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 4, 0x1f; + +04:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 
2*$N>], s22; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23; +--:-:-:-:2 \@P5 R2P PR, preds, 0x0f; +--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f; + +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:-:-:0 SHF.L.U64 preds, preds, 12, preds; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 8, 0x1f; + +08:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32; +--:1:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33; + +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:-:-:0 PSETP.AND.AND P5, PT, P5, P6, PT; // k < K && tid31 == 0 +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 16, 0x1f; +10:-:-:-:2 FADD sum0, sum1, sum0; +--:5:-:-:1 \@P5 STG.E.CG [Sum], sum0; + } : qq{ + + +--:-:-:-:1 LEA Out0.CC, offsetO, param_O[0], $dshift; +--:-:-:-:1 LEA.HI.X Out1, offsetO, param_O[1], RZ, $dshift; + +01:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + +02:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + +04:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 2*$N>], s22; 
+--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.L.U64 preds, preds, 12, preds; + +08:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32; +--:1:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33; + + + }; ++] + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Maxwell/xconv_xprop_common.sass b/Kernel/Convolution/Maxwell/xconv_xprop_common.sass new file mode 100644 index 0000000..110dc4d --- /dev/null +++ b/Kernel/Convolution/Maxwell/xconv_xprop_common.sass @@ -0,0 +1,841 @@ + + +[- + # Kernel Options: + our ($beta, $bias, $relu, $prelu, $brelu, $bprelu, $bsum); + + # set externally + our ($prefix, $prop, $shareI, $shareF, $stepI, $stepF, $remapI, $remapF); + + our $addr_shift = $prefix eq 's' ? 2 : 1; + our $half = $prefix eq 'h'; + + sub params + { + return <<'EOF'; + param_Sum[0] : c[0x0][0x140] + param_Sum[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_N : c[0x0][0x174] + param_K : c[0x0][0x178] + param_D : c[0x0][0x17c] + param_H : c[0x0][0x180] + param_W : c[0x0][0x184] + param_WN : c[0x0][0x188] + param_HWN : c[0x0][0x18c] + param_DHWN : c[0x0][0x190] + param_C : c[0x0][0x194] + param_KRST : c[0x0][0x198] + param_RST : c[0x0][0x19c] + param_RS : c[0x0][0x1a0] + param_T : c[0x0][0x1a4] + param_R : c[0x0][0x1a8] + param_S : c[0x0][0x1ac] + param_magic_RS : c[0x0][0x1b0] + param_shift_RS : c[0x0][0x1b4] + param_magic_S : c[0x0][0x1b8] + param_shift_S : c[0x0][0x1bc] + param_pad_d 
: c[0x0][0x1c0] + param_pad_h : c[0x0][0x1c4] + param_pad_w : c[0x0][0x1c8] + param_str_d : c[0x0][0x1cc] + param_str_h : c[0x0][0x1d0] + param_str_w : c[0x0][0x1d4] + param_dil_d : c[0x0][0x1d8] + param_dil_h : c[0x0][0x1dc] + param_dil_w : c[0x0][0x1e0] + param_P2 : c[0x0][0x1e4] + param_Q : c[0x0][0x1e8] + param_PQk : c[0x0][0x1ec] + param_Qk : c[0x0][0x1f0] + param_k : c[0x0][0x1f4] + param_magic_PQk : c[0x0][0x1f8] + param_shift_PQk : c[0x0][0x1fc] + param_magic_Qk : c[0x0][0x200] + param_shift_Qk : c[0x0][0x204] + param_magic_k : c[0x0][0x208] + param_shift_k : c[0x0][0x20c] + param_QN : c[0x0][0x210] + param_PQN : c[0x0][0x214] + param_MPQN : c[0x0][0x218] + param_gridN : c[0x0][0x21c] + param_gridQN : c[0x0][0x220] + param_gridPQN : c[0x0][0x224] + param_gridMPQN : c[0x0][0x228] + param_magic_str_d : c[0x0][0x22c] + param_shift_str_d : c[0x0][0x230] + param_magic_str_h : c[0x0][0x234] + param_shift_str_h : c[0x0][0x238] + param_magic_str_w : c[0x0][0x23c] + param_shift_str_w : c[0x0][0x240] +EOF + } + + sub get_mpqk + { + return <<'EOF'; +// idx_M = idx_MPQk / blk_PQk +--:-:-:-:1 MOV magic_PQk, param_magic_PQk; +--:-:-:-:1 ISETP.NE.AND P1, PT, magic_PQk, 1, PT; +02:-:-:-:1 @P1 XMAD div1, idx_MPQk, magic_PQk, RZ; +--:-:-:-:1 @P1 XMAD div2, idx_MPQk, magic_PQk.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, idx_MPQk.H1, magic_PQk.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, idx_MPQk.H1, magic_PQk, div1; +--:-:-:-:1 @P1 IADD3.RS idx_M, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 idx_M, idx_M, param_shift_PQk; +--:-:-:-:1 @!P1 SHR.U32 idx_M, idx_MPQk, param_shift_PQk; + +// idx_PQk = idx_MPQk % blk_PQk +--:-:-:-:1 IADD neg_PQk, RZ, -param_PQk; +--:-:-:-:1 XMAD.LO2 idx_PQk, neg_PQk, idx_M, idx_MPQk; + +// idx_P2 = idx_PQk / blk_Qk +--:-:-:-:1 MOV magic_Qk, param_magic_Qk; +--:-:-:-:1 ISETP.NE.AND P2, PT, magic_Qk, 1, PT; +--:-:-:-:1 @P2 XMAD div1, idx_PQk, magic_Qk, RZ; +--:-:-:-:1 @P2 XMAD div2, idx_PQk, magic_Qk.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, idx_PQk.H1, magic_Qk.H1, RZ; 
+--:-:-:-:1 @P2 XMAD.CHI div1, idx_PQk.H1, magic_Qk, div1; +--:-:-:-:1 @P2 IADD3.RS idx_P2, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 idx_P2, idx_P2, param_shift_Qk; +--:-:-:-:1 @!P2 SHR.U32 idx_P2, idx_PQk, param_shift_Qk; + +// idx_Qk = idx_PQk % blk_Qk +--:-:-:-:1 IADD neg_Qk, RZ, -param_Qk; +--:-:-:-:1 XMAD.LO2 idx_Qk, neg_Qk, idx_P2, idx_PQk; + +// idx_Q2 = idx_Qk / k +--:-:-:-:1 XMAD.LO2C idx_Q2, idx_Qk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_Q2, idx_Q2, param_shift_k; +// idx_k = idx_Qk % k +--:-:-:-:1 IADD neg_k, RZ, -param_k; +--:-:-:-:1 XMAD.S16.U16 idx_k, neg_k, idx_Q2, idx_Qk; + +// idx_K = idx_K * blk_k + idx_k +04:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// idx_P = idx_P2 * 2 +// idx_Q = idx_Q2 +// if idx_P2 != gridP2: +// idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1) +// idx_Q = idx_Q2 >> 1 +--:-:-:-:1 ISETP.NE.AND P1, PT, idx_P2, param_P2, PT; +--:-:-:-:1 SHL idx_P, idx_P2, 1; +--:-:-:-:1 @P1 LOP.AND q1, idx_Q2, 1; +--:-:-:-:1 @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P1 LOP.XOR q1, q1, q2; +--:-:-:-:1 @P1 IADD idx_P, idx_P, q1; +--:-:-:-:1 @P1 SHR.U32 idx_Q, idx_Q2, 1; +--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2; + +// Scan backwards on odd rows +// if idx_P2 & 1: +// idx_Q = Q - idx_Q - 1 +--:-:-:-:1 LOP.AND.NZ P2, RZ, idx_P2, 1; +--:-:-:-:1 MOV negOne, -1; +--:-:-:-:1 @P2 IADD3 idx_Q, -idx_Q, param_Q, negOne; + +EOF + } + + sub load_zeros + { + return "--:-:-:-:1 STS.128 [addr_zero], RZ;\n" . 
+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + } + + sub begin_lut + { + return <<'EOF'; +--:-:-:-:5 @P0 BRA.U END_SETUP; + + +--:-:-:-:1 STS.128 [addr_mpqk], mpqk; + +--:-:-:-:1 MOV rst, tid; +--:-:-:-:1 MOV lutStore2, RZ; +--:-:-:-:1 MOV lutSize, RZ; +--:-:-:-:1 MOV warp_count, 32; + +--:-:-:-:1 IADD mask_shr, -tid, 32; +--:-:-:-:1 SHR.U32 dep_thd_mask, negOne, mask_shr; +EOF + } + + sub end_lut + { + return sprintf <<'EOF', $addr_shift; + +// Get a mask of all valid slices in the warp +--:-:-:-:1 VOTE.ANY ballot, PT, P1; +// Count the total valid slices +--:-:2:-:1 POPC warp_slices, ballot; +// Prepare lutStore for this and next loop +--:-:-:-:1 @P1 MOV lutStore, lutStore2; +02:-:-:-:1 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +--:-:-:-:1 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot; +--:-:3:-:1 @P1 POPC dep_thd_cnt, dep_thd_bits; +// use the rst increment to space the barrier sync +--:-:-:-:1 IADD rst, rst, 32; +// Update the lutStore address from this count +04:-:-:-:1 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +--:1:-:-:1 @P1 STS.64 [lutStore + addr_lut], sliceIF; + +// Keep track of the total size of the lut +--:-:-:-:1 IADD lutSize, lutSize, warp_slices; + + +--:-:-:-:5 @P0 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +--:1:-:-:2 STS [addr_szLut], lutSize; + +END_SETUP: + +01:-:-:-:5 BAR.SYNC 0; + +// Grab the calculated lut size and get its reciprocal +// Get the total reduction depth +--:-:1:-:2 LDS lutSize, [addr_szLut]; +01:-:-:-:0 XMAD endCRST, lutSize, param_C, RZ; +--:-:1:-:2 I2F.F32.S32 lutSizeRcp, lutSize; +01:-:1:-:1 MUFU.RCP lutSizeRcp, lutSizeRcp; + + +// lutSize != 0 +--:-:-:-:1 LOP.AND.NZ P0, RZ, lutSize, -1; +// posCRST = endCRST - tidY - 1 +--:-:-:-:1 IADD3 posCRST, endCRST, -1, -tidY; +// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch. 
+// If it is a multiple of 8 then make a full 8 line fetch. +--:-:-:-:1 LOP.AND.Z P1, partial, endCRST, 7; +--:-:-:-:1 @P1 MOV partial, 8; +// channel = posCRST / lutSize +// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it +--:-:2:-:1 I2F.F32.S32 posCRSTf, posCRST; +03:-:-:-:1 FMUL channel, posCRSTf, lutSizeRcp; +--:-:-:-:1 FFMA channel, channel, 5.9604644775390625e-08, channel; +--:-:2:-:1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST %% lutSize) * 8 +02:-:-:-:1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST; +--:-:-:-:1 SHL lutOffset, lutOffset, 3; +// P1 = tidY < partial && lutSize != 0 +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY, partial, P0; +// offsetIC = channel * DHWN +// offsetFC = channel * KRST +--:-:-:-:1 XMAD.LO2C offsetIc, channel, param_DHWN, RZ; +--:-:-:-:1 XMAD offsetFc, channel, param_KRST, RZ; +// posCRST -= partial +--:-:-:-:1 IADD posCRST, posCRST, -partial; +--:-:1:-:2 @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut]; + + +// trackI = offsetIN + offsetIC + sliceI + param_I +// trackF = offsetFK + offsetFC + sliceF + param_F +01:-:-:-:1 @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF; +--:-:-:-:5 @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI; +--:-:-:-:6 @P1 LEA trackF0.CC, offsetF, param_F[0], %1$s; +--:-:-:-:1 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, %1$s; +--:-:-:-:6 @P1 LEA trackI0.CC, offsetI, param_I[0], %1$s; +--:-:-:-:0 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, %1$s; +EOF + } + + sub fprop_lut + { + return begin_lut() . <<'EOF' . 
end_lut(); +// mt = m * w - pad_d +// pr = p * u - pad_h +// qs = q * v - pad_w +--:-:-:-:1 XMAD qs, q, param_str_w, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 IADD qs, qs, -param_pad_w; +--:-:-:-:1 IADD pr, pr, -param_pad_h; +--:-:-:-:1 IADD mt, mt, -param_pad_d; + + +LUT_LOOP: + + +// warp synchronous loop while warp_count < RST +--:-:-:-:1 ISETP.LT.AND P0, PT, warp_count, param_RST, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, rst, param_RST, PT; + +--:-:-:-:1 IADD warp_count, warp_count, 32; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = qs + (s * dil_w) +// y = pr + (r * dil_h) +// z = mt + (t * dil_d) +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 ISETP.GE.AND P4, PT, x, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P5, PT, y, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P6, PT, z, RZ, P6; +--:-:-:-:1 ISETP.LT.AND P4, PT, x, param_W, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, y, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, z, param_D, P6; +--:-:-:-:1 PSETP.AND.AND P1, PT, P4, P5, P6; + + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD sliceI, x, param_N, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, sliceI; +// sliceF = rst * K +--:-:-:-:1 XMAD sliceF, rst, param_K, RZ; + +EOF + } + + sub bprop_lut + { + return begin_lut() . <<'EOF' . 
end_lut(); +--:-:-:-:1 MOV str_d, param_str_d; +--:-:-:-:1 MOV str_h, param_str_h; +--:-:-:-:1 MOV str_w, param_str_w; +// qs = q - pad_w +// pr = p - pad_h +// mt = m - pad_d +--:-:-:-:1 IADD qs, q, -param_pad_w; +--:-:-:-:1 IADD pr, p, -param_pad_h; +--:-:-:-:1 IADD mt, m, -param_pad_d; + + +LUT_LOOP: + + +// warp synchronous loop while warp_count < RST +--:-:-:-:1 ISETP.LT.AND P0, PT, warp_count, param_RST, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, rst, param_RST, PT; +--:-:-:-:1 IADD warp_count, warp_count, 32; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = qs + (s * dil_w) +// y = pr + (r * dil_h) +// z = mt + (t * dil_d) +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 ISETP.GE.AND P4, PT, x, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P5, PT, y, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P6, PT, z, RZ, P6; +// x_prime = x / str_w +// x = x % str_w +--:-:-:-:1 XMAD x_prime, x, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x_prime, x_prime, param_shift_str_w; +--:-:-:-:1 VMAD.U16.U16 x, -x_prime, str_w, x; +// y_prime = y / str_h +// y = y % str_h +--:-:-:-:1 XMAD y_prime, y, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y_prime, y_prime, param_shift_str_h; +--:-:-:-:1 VMAD.U16.U16 y, -y_prime, str_h, y; +// z_prime = z / str_d +// z = z % str_d +--:-:-:-:1 XMAD z_prime, z, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z_prime, z_prime, param_shift_str_d; +--:-:-:-:1 VMAD.U16.U16 z, -z_prime, str_d, z; + +--:-:-:-:1 ISETP.EQ.AND P4, PT, x, RZ, P4; +--:-:-:-:1 ISETP.EQ.AND P5, PT, y, RZ, P5; +--:-:-:-:1 ISETP.EQ.AND P6, PT, z, RZ, P6; +--:-:-:-:1 ISETP.LT.AND P4, PT, x_prime, 
param_W, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, y_prime, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, z_prime, param_D, P6; +--:-:-:-:1 PSETP.AND.AND P1, PT, P4, P5, P6; + +// sliceI = z_prime*HWN + y_prime*WN + x_prime*N +01:-:-:-:1 XMAD sliceI, x_prime, param_N, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y_prime, param_WN, sliceI; +--:-:-:-:1 XMAD.LO2C sliceI, z_prime, param_HWN, sliceI; +// sliceF = rst * K +01:-:-:-:1 XMAD sliceF, rst, param_K, RZ; +EOF + } + + sub load_lut + { + return $prop eq 'f' ? fprop_lut() : bprop_lut(); + } + + sub loop_setup + { + my $swap; + if ($shareI == $shareF) + { + $swap = <<'EOF'; +--:-:-:-:0 LOP.XOR writeS, writeS, 4x; +EOF + } + else + { + $swap = <<'EOF'; +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD writeFs, writeFs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; +EOF + } + return sprintf <<'EOF', $shareI, $shareF, $stepI, $stepF, $addr_shift, $swap; + +--:-:-:-:0 ISETP.GE.AND P1, PT, posCRST, RZ, PT; +--:-:2:-:1 I2F.F32.S32 posCRSTf, posCRST; + +01:-:-:-:5 BAR.SYNC 0; +%6$s + +--:-:-:-:1 LDS.U.128 j0Ix0, [readIs + 4x<0*%1$-3s + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*%2$-3s + 00>]; +--:-:-:-:1 LDS.U.128 j0Ix4, [readIs + 4x<0*%1$-3s + %3$s>]; +--:-:1:-:2 LDS.U.128 j0Fy4, [readFs + 4x<0*%2$-3s + %4$s>]; + + +// channel = posCRST / lutSize +02:-:-:-:1 @P1 FMUL channel, posCRSTf, lutSizeRcp; +--:-:-:-:1 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel; +--:-:2:-:1 @P1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST %% lutSize) * 8 +02:-:-:-:1 @P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST; +--:-:-:-:1 @P1 SHL lutOffset, lutOffset, 3; +// offsetIC = channel * DHWN +// offsetFC = channel * KRST +--:-:-:-:1 @P1 XMAD.LO2C offsetIc, channel, param_DHWN, RZ; +--:-:-:-:1 @P1 XMAD offsetFc, channel, param_KRST, RZ; + +--:-:-:-:1 IADD posCRST, posCRST, -8; +--:-:2:-:2 @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut]; + + +// trackI = offsetIN + offsetIC + sliceI + param_I +// trackF = 
offsetFK + offsetFC + sliceF + param_F +02:-:-:-:1 @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF; +--:-:-:-:5 @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI; +--:-:-:-:6 @P1 LEA trackF0.CC, offsetF, param_F[0], %5$s; +--:-:-:-:1 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, %5$s; +--:-:-:-:6 @P1 LEA trackI0.CC, offsetI, param_I[0], %5$s; +--:-:-:-:0 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, %5$s; +EOF + } + + sub main_loop + { + our %insert; + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx0, [readIs + 4x<%d*%-3s + 00>];\n", $rsPred, $nOdd, $rsOffset, $shareI; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*%-3s + 00>];\n", $rsPred, $nOdd, $rsOffset, $shareF; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx4, [readIs + 4x<%d*%-3s + %s>];\n", $rsPred, $nOdd, $rsOffset, $shareI, $stepI; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*%-3s + %s>];\n", $rsPred, $nOdd, $rsOffset, $shareF, $stepF; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + } + + + sub output_setup + { + my ($tidOX, $warp_shift, $bsum_shift) = @_; + my $out; + + $out .= qq{ +02:-:-:-:1 SHR.U32 bsum_offset, tidOX, $bsum_shift; +04:-:-:-:1 ISCADD bsum_offset, idx_N, bsum_offset, $warp_shift; +01:-:-:-:1 XMAD bsum_offset, idx_Q, param_gridN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_P, param_gridQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset; + +--:-:-:-:1 LOP.AND.Z P5, RZ, tidOX, $tidOX; + } if $bsum; + + $out .= qq{ +// out_offset = m*PQN + p*QN + q*N + n +01:-:-:-:1 XMAD out_offset, q, param_N, n; +--:-:-:-:1 XMAD.LO2C out_offset, p, param_QN, out_offset; +--:-:-:-:1 XMAD.LO2C out_offset, m, param_PQN, out_offset; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV32I one, 1.0; + +--:-:-:-:1 ISETP.EQ.AND P2, PT, RZ, param_flags, PT; // no output +--:-:-:-:1 ISETP.LT.AND P0, PT, n, param_N, P2; + }; + + $out .= $half ? q{ +--:-:-:-:1 ISETP.LT.AND P1, PT, n, param_N, P2; + } : qq{ +--:-:-:-:1 IADD n, n, $stepI; +--:-:-:-:1 ISETP.LT.AND P1, PT, n, param_N, P2; + }; + return $out; + } + + sub output + { + my $out = q{ +--:-:-:-:5 BAR.SYNC 0; + }; + + foreach my $y (0..7) + { + my $incK = $y == 4 && !$remapF ? $stepF-3 : 1; + my $stepK = $y ? 
"\n--:-:-:-:1 IADD k, k, $incK;" : ""; + + $out .= qq{$stepK +--:-:-:-:1 FMUL cs0, cx0y$y, alpha; +--:-:-:-:1 FMUL cs1, cx1y$y, alpha; +--:-:-:-:1 FMUL cs2, cx2y$y, alpha; +--:-:-:-:1 FMUL cs3, cx3y$y, alpha; +--:-:-:-:1 FMUL cs4, cx4y$y, alpha; +--:-:-:-:1 FMUL cs5, cx5y$y, alpha; +--:-:-:-:1 FMUL cs6, cx6y$y, alpha; +--:-:-:-:0 FMUL cs7, cx7y$y, alpha; +--:-:-:-:5 CAL STORE_O; + }; + } + $out .= q{ + +--:-:-:-:5 EXIT; + +STORE_O: + + +30:-:-:-:1 XMAD offset, k, param_MPQN, out_offset; +--:-:-:-:1 XMAD.PSL offset, k, param_MPQN.H1, offset; +--:-:-:-:1 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n < N +--:-:-:-:1 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n < N + }; + + if ($beta || $brelu || $bprelu) + { + $out .= qq{ +--:-:-:-:1 LEA Out0.CC, offset, param_X[0], $addr_shift; +--:-:-:-:1 LEA.HI.X Out1, offset, param_X[1], RZ, $addr_shift; + }; + $out .= $half ? q{ +--:-:5:-:2 @P2 LDG.E.128 b0, [Out]; + } : q{ +--:-:5:-:1 @P2 LDG.E.128 b0, [Out + 4x<00>]; +--:-:6:-:1 @P3 LDG.E.128 b4, [Out + 4x<$stepI>]; + }; + } + + $out .= q{ +--:-:-:-:1 LEA Sum0.CC, k, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, k, param_Sum[1], RZ, 2; + +--:-:6:-:1 @P2 LDG.E.CI b0, [Sum]; +--:-:-:-:1 @!P2 MOV b0, RZ; + } if $bias; + + $out .= q{ + +--:-:-:-:1 STS.128 [writeCs + 4x<00>], cs0; +--:-:-:-:1 STS.128 [writeCs + 4x<$remapI ? 4 : $stepI>], cs4; +--:-:1:-:1 @P2 LDS.U.128 out0, [readCs + 4x<00>]; +--:-:2:-:1 @P3 LDS.U.128 out4, [readCs + 4x<$half ? 
4 : $stepI>]; + + + + }; + + $out .= q{ +21:-:-:-:1 FADD out0, out0, b0; +--:-:-:-:1 FADD out1, out1, b0; +--:-:-:-:1 FADD out2, out2, b0; +--:-:-:-:1 FADD out3, out3, b0; +02:-:-:-:1 FADD out4, out4, b0; +--:-:-:-:1 FADD out5, out5, b0; +--:-:-:-:1 FADD out6, out6, b0; +--:-:-:-:1 FADD out7, out7, b0; + } if $bias; + + $out .= q{ +01:-:-:-:1 FMNMX out0, out0, RZ, !PT; +--:-:-:-:1 FMNMX out1, out1, RZ, !PT; +--:-:-:-:1 FMNMX out2, out2, RZ, !PT; +--:-:-:-:1 FMNMX out3, out3, RZ, !PT; +02:-:-:-:1 FMNMX out4, out4, RZ, !PT; +--:-:-:-:1 FMNMX out5, out5, RZ, !PT; +--:-:-:-:1 FMNMX out6, out6, RZ, !PT; +--:-:-:-:1 FMNMX out7, out7, RZ, !PT; + } if $relu; + + $out .= q{ +// maximum(x, 0) + slope * minimum(0, x) +01:-:-:-:1 FMNMX b0, out0, RZ, !PT; +--:-:-:-:1 FMNMX b1, out1, RZ, !PT; +--:-:-:-:1 FMNMX b2, out2, RZ, !PT; +--:-:-:-:1 FMNMX b3, out3, RZ, !PT; +02:-:-:-:1 FMNMX b4, out4, RZ, !PT; +--:-:-:-:1 FMNMX b5, out5, RZ, !PT; +--:-:-:-:1 FMNMX b6, out6, RZ, !PT; +--:-:-:-:1 FMNMX b7, out7, RZ, !PT; + +--:-:-:-:1 FMNMX x0, out0, RZ, PT; +--:-:-:-:1 FMNMX x1, out1, RZ, PT; +--:-:-:-:1 FMNMX x2, out2, RZ, PT; +--:-:-:-:1 FMNMX x3, out3, RZ, PT; +--:-:-:-:1 FMNMX x4, out4, RZ, PT; +--:-:-:-:1 FMNMX x5, out5, RZ, PT; +--:-:-:-:1 FMNMX x6, out6, RZ, PT; +--:-:-:-:1 FMNMX x7, out7, RZ, PT; + +--:-:-:-:1 FFMA out0, x0, param_beta, b0; +--:-:-:-:1 FFMA out1, x1, param_beta, b1; +--:-:-:-:1 FFMA out2, x2, param_beta, b2; +--:-:-:-:1 FFMA out3, x3, param_beta, b3; +--:-:-:-:1 FFMA out4, x4, param_beta, b4; +--:-:-:-:1 FFMA out5, x5, param_beta, b5; +--:-:-:-:1 FFMA out6, x6, param_beta, b6; +--:-:-:-:1 FFMA out7, x7, param_beta, b7; + } if $prelu; + + $out .= q{ + + }; + + $out .= q{ +13:-:-:-:1 @P2 F2F.F32.F16 b7, b3.H1; +--:-:-:-:1 @P2 F2F.F32.F16 b6, b3.H0; +--:-:-:-:1 @P2 F2F.F32.F16 b5, b2.H1; +--:-:-:-:1 @P2 F2F.F32.F16 b4, b2.H0; +--:-:-:-:1 @P2 F2F.F32.F16 b3, b1.H1; +--:-:-:-:1 @P2 F2F.F32.F16 b2, b1.H0; +--:-:-:-:1 @P2 F2F.F32.F16 b1, b0.H1; +--:-:5:-:2 @P2 
F2F.F32.F16 b0, b0.H0; + } if $half && ($beta || $brelu || $bprelu); + + $out .= q{ + + }; + + $out .= q{ +11:-:-:-:1 @P2 FFMA out0, b0, param_beta, out0; +--:-:-:-:1 @P2 FFMA out1, b1, param_beta, out1; +--:-:-:-:1 @P2 FFMA out2, b2, param_beta, out2; +--:-:-:-:1 @P2 FFMA out3, b3, param_beta, out3; +22:-:-:-:1 @P3 FFMA out4, b4, param_beta, out4; +--:-:-:-:1 @P3 FFMA out5, b5, param_beta, out5; +--:-:-:-:1 @P3 FFMA out6, b6, param_beta, out6; +--:-:-:-:1 @P3 FFMA out7, b7, param_beta, out7; + } if $beta; + + $out .= q{ +//delta *= (x > 0) +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +11:-:-:-:1 FSETP.GT.AND P0, PT, b0, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b1, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b2, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b3, RZ, PT; +--:-:-:-:1 @!P0 MOV out0, RZ; +--:-:-:-:1 @!P1 MOV out1, RZ; +--:-:-:-:1 @!P2 MOV out2, RZ; +--:-:-:-:1 @!P3 MOV out3, RZ; +22:-:-:-:1 FSETP.GT.AND P0, PT, b4, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b5, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b6, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b7, RZ, PT; +--:-:-:-:1 @!P0 MOV out4, RZ; +--:-:-:-:1 @!P1 MOV out5, RZ; +--:-:-:-:1 @!P2 MOV out6, RZ; +--:-:-:-:1 @!P3 MOV out7, RZ; +--:-:-:-:5 R2P PR, preds, 0x0f; + } if $brelu; + + $out .= q{ +//delta *= ((x > 0) + slope * (x < 0)) +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +11:-:-:-:1 FSETP.GT.AND P0, PT, b0, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b1, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b2, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b3, RZ, PT; +--:-:-:-:1 SEL x0, one, RZ, P0; +--:-:-:-:1 SEL x1, one, RZ, P1; +--:-:-:-:1 SEL x2, one, RZ, P2; +--:-:-:-:1 SEL x3, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b0, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b1, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b2, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b3, RZ, PT; +--:-:-:-:1 SEL b0, one, RZ, P0; +--:-:-:-:1 SEL b1, one, RZ, P1; +--:-:-:-:1 SEL b2, one, RZ, P2; +--:-:-:-:1 SEL b3, one, RZ, P3; +--:-:-:-:1 FFMA b0, b0, param_beta, x0; 
+--:-:-:-:1 FFMA b1, b1, param_beta, x1; +--:-:-:-:1 FFMA b2, b2, param_beta, x2; +--:-:-:-:1 FFMA b3, b3, param_beta, x3; +--:-:-:-:1 FMUL out0, out0, b0; +--:-:-:-:1 FMUL out1, out1, b1; +--:-:-:-:1 FMUL out2, out2, b2; +--:-:-:-:1 FMUL out3, out3, b3; +22:-:-:-:1 FSETP.GT.AND P0, PT, b4, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b5, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b6, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b7, RZ, PT; +--:-:-:-:1 SEL x4, one, RZ, P0; +--:-:-:-:1 SEL x5, one, RZ, P1; +--:-:-:-:1 SEL x6, one, RZ, P2; +--:-:-:-:1 SEL x7, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b4, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b5, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b6, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b7, RZ, PT; +--:-:-:-:1 SEL b4, one, RZ, P0; +--:-:-:-:1 SEL b5, one, RZ, P1; +--:-:-:-:1 SEL b6, one, RZ, P2; +--:-:-:-:1 SEL b7, one, RZ, P3; +--:-:-:-:1 R2P PR, preds, 0x0f; +--:-:-:-:1 FFMA b4, b4, param_beta, x4; +--:-:-:-:1 FFMA b5, b5, param_beta, x5; +--:-:-:-:1 FFMA b6, b6, param_beta, x6; +--:-:-:-:1 FFMA b7, b7, param_beta, x7; +--:-:-:-:1 FMUL out4, out4, b4; +--:-:-:-:1 FMUL out5, out5, b5; +--:-:-:-:1 FMUL out6, out6, b6; +--:-:-:-:1 FMUL out7, out7, b7; + } if $bprelu; + + $out .= q{ +--:-:-:-:1 @!P2 MOV sum0, RZ; +--:-:-:-:1 @!P3 MOV sum2, RZ; +01:-:-:-:1 @P2 FADD sum0, out0, out1; +--:-:-:-:1 @P2 FADD sum1, out2, out3; +02:-:-:-:1 @P3 FADD sum2, out4, out5; +--:-:-:-:1 @P3 FADD sum3, out6, out7; +--:-:-:-:1 @P2 FADD sum0, sum0, sum1; +--:-:-:-:1 @P3 FADD sum2, sum2, sum3; +--:-:-:-:1 FADD sum0, sum0, sum2; + } if $bsum; + + $out .= q{ + +01:-:-:-:1 @P2 F2F.F16.F32 out0, out0; +--:-:-:-:1 @P2 F2F.F16.F32 out1, out1; +--:-:-:-:1 @P2 F2F.F16.F32 out2, out2; +--:-:1:-:1 @P2 F2F.F16.F32 out3, out3; +02:-:-:-:1 @P2 F2F.F16.F32 out4, out4; +--:-:-:-:1 @P2 F2F.F16.F32 out5, out5; +--:-:-:-:1 @P2 F2F.F16.F32 out6, out6; +--:-:2:-:1 @P2 F2F.F16.F32 out7, out7; + + } if $half; + + $out .= q{ + + }; + + $out .= $half ? 
qq{ + +--:-:-:-:1 LEA Out0.CC, offset, param_O[0], $addr_shift; +--:-:-:-:1 LEA.HI.X Out1, offset, param_O[1], RZ, $addr_shift; + +01:-:-:-:1 \@P2 BFI c0, out1, 0x1010, out0; +--:-:-:-:1 \@P2 BFI c1, out3, 0x1010, out2; +02:-:-:-:1 \@P2 BFI c2, out5, 0x1010, out4; +--:-:-:-:1 \@P2 BFI c3, out7, 0x1010, out6; + +--:5:-:-:1 \@P2 STG.E.CG.128 [Out], c0; + + } : qq{ + +--:-:-:-:1 LEA Out0.CC, offset, param_O[0], $addr_shift; +--:-:-:-:1 LEA.HI.X Out1, offset, param_O[1], RZ, $addr_shift; + +01:-:-:-:1 \@P2 STG.E.CG.128 [Out + 4x<00>], out0; +02:5:-:-:1 \@P3 STG.E.CG.128 [Out + 4x<$stepI>], out4; + + }; + + $out .= q{ + +--:-:-:-:1 XMAD.LO2C offset, k, param_gridMPQN, bsum_offset; +--:-:-:-:1 LEA Sum0.CC, offset, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, offset, param_Sum[1], RZ, 2; + +--:-:-:-:1 ISETP.LT.AND P6, PT, k, param_K, P5; // k < K && tid31 == 0 + +--:-:2:-:2 SHFL.BFLY PT, sum1, sum0, 1, 0x1f; +02:-:-:-:4 FADD sum0, sum1, sum0; +--:-:2:-:2 SHFL.BFLY PT, sum1, sum0, 2, 0x1f; +02:-:-:-:4 FADD sum0, sum1, sum0; +--:-:2:-:2 SHFL.BFLY PT, sum1, sum0, 4, 0x1f; +02:-:-:-:2 FADD sum0, sum1, sum0; + +--:6:-:-:1 @P6 STG.E.CG [Sum], sum0; + + } if $bsum; + + $out .= q{ +--:-:-:-:5 RET; + }; + } + +-] diff --git a/Kernel/Convolution/Pascal/hconv_bprop_C1_N64.sass b/Kernel/Convolution/Pascal/hconv_bprop_C1_N64.sass new file mode 100644 index 0000000..fb00d82 --- /dev/null +++ b/Kernel/Convolution/Pascal/hconv_bprop_C1_N64.sass @@ -0,0 +1,663 @@ +# Kernel: hconv_bprop_C32_N64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert { return $convert; } + + our $dtype = $int16 ? 'S16' : 'U16'; + sub dtype { return $dtype; } +-] + + + addr_lut : 4x<64*4> + + param_I[0] : c[0x0][0x140] + param_I[1] : c[0x0][0x144] + param_E[0] : c[0x0][0x148] + param_E[1] : c[0x0][0x14c] + param_F[0] : c[0x0][0x150] + param_F[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_N : c[0x0][0x15c] + param_K : c[0x0][0x160] + param_D : c[0x0][0x164] + param_H : c[0x0][0x168] + param_W : c[0x0][0x16c] + param_WN : c[0x0][0x170] + param_HWN : c[0x0][0x174] + param_DHWN : c[0x0][0x178] + param_C : c[0x0][0x17c] + param_CRST : c[0x0][0x180] + param_RST : c[0x0][0x184] + param_magic_RST : c[0x0][0x188] + param_shift_RST : c[0x0][0x18c] + param_RS : c[0x0][0x190] + param_magic_RS : c[0x0][0x194] + param_shift_RS : c[0x0][0x198] + param_S : c[0x0][0x19c] + param_magic_S : c[0x0][0x1a0] + param_shift_S : c[0x0][0x1a4] + param_pad_d : c[0x0][0x1a8] + param_pad_h : c[0x0][0x1ac] + param_pad_w : c[0x0][0x1b0] + param_str_d : c[0x0][0x1b4] + param_str_h : c[0x0][0x1b8] + param_str_w : c[0x0][0x1bc] + param_Q : c[0x0][0x1c0] + param_PQ : c[0x0][0x1c4] + param_QN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_MPQN : c[0x0][0x1d0] + param_magic_Q : c[0x0][0x1d4] + param_shift_Q : c[0x0][0x1d8] + param_magic_PQ : c[0x0][0x1dc] + param_shift_PQ : c[0x0][0x1e0] + param_CRST8 : c[0x0][0x1e4] + param_MPQN8 : c[0x0][0x1e8] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 
1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkE, blkF, blkMPQ + + 68-119 ~ k<0|4>, tidFX, tidEX, tid1, tid7, m, p, q, crst, n, tf<0|4>, te, te<0|4>, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-79 : j0Ex<0-7>, j0Fy<0-7> + 80-95 : j1Ex<0-7>, j1Fy<0-7> + + 96-103 : load0F<0-3>, load4F<0-3> + 96-103 : store0F<0-3>, store4F<0-3> + + 104-107 : load0E<0-3> + 104-107 : store0E<0-3> + 112-115 : store0E<4-7> + + 108-111 : load4E<0-3> + 108-111 : store4E<0-3> + 112-115 : store4E<4-7> + + 116-119 : track0F<0-1>, track4F<0-1> + 120-123 : track0E<0-1>, track4E<0-1> + + 124-127 ~ writeEs, writeFs, swapBuf, K + 128-132 ~ readEs, readFs, mt, pr, qs + + 68-71 ~ lutStore, sliceI + 72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD + + 72-93 : c<0-7>, cs<0-3>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1> + 94-127 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkF, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; + + +// tidFX = (tid & 7) << 2 +// tidEX = (tid & 7) << 3 +// k = tid >> 3 +01:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHL tidFX, tid7, 2; +--:-:-:-:1 SHL tidEX, tid7, 3; +--:-:-:-:1 SHR.U32 k0, tid, 3; +--:-:-:-:1 IADD k4, k0, 4; + +--:-:-:-:1 MOV K, param_K; + +--:-:-:-:1 STS.128 [RZ], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15; + + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = 
blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; + +// mt = m * w - pad_d +// pr = p * u - pad_h +// qs = q * v - pad_w +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, q, param_str_w, RZ; +--:-:-:-:1 IADD mt, mt, -param_pad_d; +--:-:-:-:1 IADD pr, pr, -param_pad_h; +--:-:-:-:1 IADD qs, qs, -param_pad_w; + +// crst = blkF*32 + tidX +// n = blkE*64 + tidX +04:-:-:-:1 ISCADD crst, blkF, tidFX, 5; +08:-:-:-:1 ISCADD n, blkE, tidEX, 6; + +// trackF = k*CRST + crst +--:-:-:-:1 XMAD tf0, k0, param_CRST, crst; +--:-:-:-:1 XMAD tf4, k4, param_CRST, crst; +--:-:-:-:1 LEA track0F0.CC, tf0, param_F[0], 1; +--:-:-:-:1 LEA.HI.X track0F1, tf0, param_F[1], RZ, 1; +--:-:-:-:1 LEA track4F0.CC, tf4, param_F[0], 1; +--:-:-:-:1 LEA.HI.X track4F1, tf4, param_F[1], RZ, 1; + +// trackE = k*MPQN + m*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD te, q, param_N, n; +--:-:-:-:1 XMAD.LO2C te, p, param_QN, te; +--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te; +--:-:-:-:1 XMAD.LO2C te0, k0, param_MPQN, te; +--:-:-:-:1 XMAD.LO2C te4, k4, param_MPQN, te; +--:-:-:-:1 LEA track0E0.CC, te0, param_E[0], 1; +--:-:-:-:1 LEA.HI.X track0E1, te0, param_E[1], RZ, 1; +--:-:-:-:1 LEA 
track4E0.CC, te4, param_E[0], 1; +--:-:-:-:1 LEA.HI.X track4E1, te4, param_E[1], RZ, 1; + +// P1 = crst < CRST +// P2 = n < N +// P3 = n+32 < N +--:-:-:-:1 ISETP.LT.AND P1, PT, crst, param_CRST, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, n, param_N, PT; + +// Remap the EX dim to avoid bank conflicts when storing to shared +// We can unmap this in the output + +// writeFs = (32*k + tidFX) * 4 +--:-:-:-:1 ISCADD writeFs, k0, tidFX, 5; +--:-:-:-:1 SHL writeFs, writeFs, 2; +// writeEs = (64*k + tidFX) * 4 (tidFX here not a bug) +--:-:-:-:1 ISCADD writeEs, k0, tidFX, 6; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x<32*8>, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readEs = ((tid >> 1) & 7) << 4 +--:-:-:-:1 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readEs, readEs, 4x<32*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, 4x<32*8 + 64*8>; + + +--:-:-:-:0 IADD K, K, -8; + +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F0, [track0F + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F1, [track0F + 2x<1>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F2, [track0F + 2x<2>]; +--:-:1:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F3, [track0F + 2x<3>]; + +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F0, [track4F + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F1, [track4F + 2x<1>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F2, [track4F + 2x<2>]; +--:-:2:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F3, [track4F + 2x<3>]; + +--:-:-:-:0 ISETP.GT.AND P1, PT, K, RZ, P1; + +--:-:3:-:1 @P2 LDG.E.128 load0E0, [track0E]; +--:-:4:-:1 @P2 LDG.E.128 load4E0, [track4E]; + +--:-:-:-:0 ISETP.GT.AND P2, PT, K, RZ, P2; + +01:-:-:-:1 [+ convert() +] store0F0, load0F0; +--:-:-:-:1 [+ convert() +] store0F1, load0F1; +--:-:-:-:1 [+ convert() +] store0F2, load0F2; +--:-:1:-:1 [+ convert() +] store0F3, load0F3; 
+--:-:-:-:6 IADD track0F0.CC, track0F0, param_CRST8; +--:-:-:-:0 IADD.X track0F1, track0F1, RZ; +01:-:-:-:1 STS.128 [writeFs + 4x<0*32>], store0F; + +02:-:-:-:1 [+ convert() +] store4F0, load4F0; +--:-:-:-:1 [+ convert() +] store4F1, load4F1; +--:-:-:-:1 [+ convert() +] store4F2, load4F2; +--:-:2:-:1 [+ convert() +] store4F3, load4F3; +--:-:-:-:6 IADD track4F0.CC, track4F0, param_CRST8; +--:-:-:-:0 IADD.X track4F1, track4F1, RZ; +02:-:-:-:1 STS.128 [writeFs + 4x<4*32>], store4F; + +04:-:-:-:1 [+ convert() +] store0E7, load0E3.H1; +--:-:-:-:1 [+ convert() +] store0E6, load0E3.H0; +--:-:-:-:1 [+ convert() +] store0E5, load0E2.H1; +--:-:1:-:1 [+ convert() +] store0E4, load0E2.H0; +--:-:-:-:1 [+ convert() +] store0E3, load0E1.H1; +--:-:-:-:1 [+ convert() +] store0E2, load0E1.H0; +--:-:-:-:1 [+ convert() +] store0E1, load0E0.H1; +--:-:2:-:1 [+ convert() +] store0E0, load0E0.H0; +--:-:-:-:6 IADD track0E0.CC, track0E0, param_MPQN8; +--:-:-:-:0 IADD.X track0E1, track0E1, RZ; +01:-:-:-:1 STS.128 [writeEs + 4x<0*64 + 32>], store0E4; +02:1:-:-:2 STS.128 [writeEs + 4x<0*64 + 0>], store0E0; + +09:-:-:-:1 [+ convert() +] store4E7, load4E3.H1; +--:-:-:-:1 [+ convert() +] store4E6, load4E3.H0; +--:-:-:-:1 [+ convert() +] store4E5, load4E2.H1; +--:-:1:-:1 [+ convert() +] store4E4, load4E2.H0; +--:-:-:-:1 [+ convert() +] store4E3, load4E1.H1; +--:-:-:-:1 [+ convert() +] store4E2, load4E1.H0; +--:-:-:-:1 [+ convert() +] store4E1, load4E0.H1; +--:-:2:-:1 [+ convert() +] store4E0, load4E0.H0; +--:-:-:-:6 IADD track4E0.CC, track4E0, param_MPQN8; +--:-:-:-:0 IADD.X track4E1, track4E1, RZ; +01:-:-:-:1 STS.128 [writeEs + 4x<4*64 + 32>], store4E4; +02:1:-:-:2 STS.128 [writeEs + 4x<4*64 + 0>], store4E0; + + +01:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD writeFs, writeFs, swapBuf; +--:-:-:-:2 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD K, K, -8; + +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>]; +--:-:-:-:1 
LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>]; + +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F0, [track0F + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F1, [track0F + 2x<1>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F2, [track0F + 2x<2>]; +--:-:2:-:1 @P1 LDG.E.CI.[+ dtype() +] load0F3, [track0F + 2x<3>]; + +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F0, [track4F + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F1, [track4F + 2x<1>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F2, [track4F + 2x<2>]; +--:-:3:-:1 @P1 LDG.E.CI.[+ dtype() +] load4F3, [track4F + 2x<3>]; + +--:-:-:-:0 ISETP.GT.AND P1, PT, K, RZ, P1; + +--:-:4:-:1 @P2 LDG.E.128 load0E0, [track0E]; +--:-:5:-:1 @P2 LDG.E.128 load4E0, [track4E]; + +--:-:-:-:2 ISETP.GT.AND P2, PT, K, RZ, P2; + +NEXT_8K: +--:-:-:-:1 ISETP.GT.AND P0, PT, K, -8, PT; + +[+ + our $convert; + our $dtype; + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD K, K, -8;\n", + + j0c12 => "02:-:-:-:1 \@P0 $convert store0F0, load0F0;\n", + j0c16 => "--:-:-:-:1 \@P0 $convert store0F1, load0F1;\n", + j0c20 => "--:-:-:-:1 \@P0 $convert store0F2, load0F2;\n", + j0c24 => "--:-:2:-:1 \@P0 $convert store0F3, load0F3;\n", + j0c26 => "--:-:-:-:1 \@P0 IADD track0F0.CC, track0F0, param_CRST8;\n", + j0c31 => "--:-:-:-:1 \@P0 IADD.X track0F1, track0F1, RZ;\n", + j0c38 => "02:2:-:-:1 \@P0 STS.128 [writeFs + 4x<0*32>], store0F;\n", + j1c8 => "02:-:-:-:1 \@P1 LDG.E.CI.$dtype load0F0, [track0F + 2x<0>];\n", + j1c10 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype load0F1, [track0F + 2x<1>];\n", + j1c12 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype load0F2, [track0F + 2x<2>];\n", + j1c14 => "--:-:2:-:1 \@P1 LDG.E.CI.$dtype load0F3, [track0F + 2x<3>];\n", + + j2c12 => "04:-:-:-:1 \@P0 $convert store4F0, load4F0;\n", + j2c16 => "--:-:-:-:1 \@P0 $convert store4F1, load4F1;\n", + j2c20 => "--:-:-:-:1 \@P0 $convert store4F2, load4F2;\n", + j2c24 => "--:-:3:-:1 \@P0 $convert store4F3, load4F3;\n", + j2c26 => "--:-:-:-:1 \@P0 
IADD track4F0.CC, track4F0, param_CRST8;\n", + j2c31 => "--:-:-:-:1 \@P0 IADD.X track4F1, track4F1, RZ;\n", + j2c38 => "04:3:-:-:1 \@P0 STS.128 [writeFs + 4x<4*32>], store4F;\n", + j3c8 => "04:-:-:-:1 \@P1 LDG.E.CI.$dtype load4F0, [track4F + 2x<0>];\n", + j3c10 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype load4F1, [track4F + 2x<1>];\n", + j3c12 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype load4F2, [track4F + 2x<2>];\n", + j3c14 => "--:-:3:-:1 \@P1 LDG.E.CI.$dtype load4F3, [track4F + 2x<3>];\n", + + j4c12 => "08:-:-:-:1 \@P0 $convert store0E7, load0E3.H1;\n", + j4c16 => "--:-:-:-:1 \@P0 $convert store0E6, load0E3.H0;\n", + j4c20 => "--:-:-:-:1 \@P0 $convert store0E5, load0E2.H1;\n", + j4c24 => "--:-:6:-:1 \@P0 $convert store0E4, load0E2.H0;\n", + j4c28 => "--:-:-:-:1 \@P0 $convert store0E3, load0E1.H1;\n", + j4c32 => "--:-:-:-:1 \@P0 $convert store0E2, load0E1.H0;\n", + j4c36 => "--:-:-:-:1 \@P0 $convert store0E1, load0E0.H1;\n", + j4c40 => "--:-:4:-:1 \@P0 $convert store0E0, load0E0.H0;\n", + j4c42 => "20:-:-:-:1 \@P0 STS.128 [writeEs + 4x<0*64 + 32>], store0E4;\n", + j4c44 => "--:-:-:-:1 \@P0 IADD track0E0.CC, track0E0, param_MPQN8;\n", + j4c49 => "--:-:-:-:1 \@P0 IADD.X track0E1, track0E1, RZ;\n", + j4c56 => "08:4:-:-:1 \@P0 STS.128 [writeEs + 4x<0*64 + 0>], store0E0;\n", + j5c8 => "08:-:4:-:1 \@P2 LDG.E.128 load0E0, [track0E];\n", + + j5c12 => "10:-:-:-:1 \@P0 $convert store4E7, load4E3.H1;\n", + j5c16 => "--:-:-:-:1 \@P0 $convert store4E6, load4E3.H0;\n", + j5c20 => "--:-:-:-:1 \@P0 $convert store4E5, load4E2.H1;\n", + j5c24 => "--:-:6:-:1 \@P0 $convert store4E4, load4E2.H0;\n", + j5c28 => "--:-:-:-:1 \@P0 $convert store4E3, load4E1.H1;\n", + j5c32 => "--:-:-:-:1 \@P0 $convert store4E2, load4E1.H0;\n", + j5c36 => "--:-:-:-:1 \@P0 $convert store4E1, load4E0.H1;\n", + j5c40 => "--:-:5:-:1 \@P0 $convert store4E0, load4E0.H0;\n", + j5c42 => "20:-:-:-:1 \@P0 STS.128 [writeEs + 4x<4*64 + 32>], store4E4;\n", + j5c44 => "--:-:-:-:1 \@P0 IADD track4E0.CC, track4E0, param_MPQN8;\n", + 
j5c49 => "--:-:-:-:1 \@P0 IADD.X track4E1, track4E1, RZ;\n", + j5c56 => "10:5:-:-:1 \@P0 STS.128 [writeEs + 4x<4*64 + 0>], store4E0;\n", + j6c8 => "10:-:5:-:1 \@P2 LDG.E.128 load4E0, [track4E];\n", + + j6c63 => "20:-:-:-:1 \@P0 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c8 => "--:-:-:-:1 ISETP.GT.AND P1, PT, K, RZ, P1;\n", # P1 = K > 0 && previous P1 (keeps crst < CRST) + j7c10 => "--:-:-:-:1 ISETP.GT.AND P2, PT, K, RZ, P2;\n", # P2 = K > 0 && previous P2 (keeps n < N, as the prologue does) + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_8K;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + my $barrier = $j == 6 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $barrier, $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2F|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +--:-:-:-:0 MOV warp_cnt, 32; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkF, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +01:-:-:-:6 MOV rst, tid; + +LUT_LOOP: + + +// warp synchronous loop while warp_cnt < RST (c=0) +--:-:-:-:1 ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT; +--:-:-:-:1 IADD warp_cnt, warp_cnt, 32; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = qs + s +// y = pr + r +// z = mt + t +--:-:-:-:1 IADD z, mt, t; +--:-:-:-:1 IADD y, pr, r; +--:-:-:-:1 IADD x, qs, s; +// i = (z*HWN + y*WN + x*N) * 2 (the SHL by 1 below doubles the element index) +20:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD sliceI, x, param_N, sliceI; +--:-:-:-:1 SHL sliceI, sliceI, 1; +// Bounds check x, y and z, and make i negative if outside +--:-:-:-:1 ISET.LT.AND x0, x, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW, x, param_W, PT; +--:-:-:-:1 ISET.LT.AND y0, y, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH, y, param_H, PT; +--:-:-:-:1 ISET.LT.AND z0, z, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD, z, param_D, PT; +--:-:-:-:1 LOP3.LUT sliceI, sliceI, x0, xW, 0xfe; + +--:-:-:-:1 LOP3.LUT sliceI, sliceI, y0, yH, 0xfe; +--:-:-:-:1 SHL lutStore, rst, 2; +--:-:-:-:1 IADD rst, rst, 32; + +--:-:-:-:1 LOP3.LUT sliceI, sliceI, z0, zD, 0xfe; +// Store i imgOffset into the shared lookup table +--:6:-:-:1 STS [lutStore + addr_lut], sliceI; + + +--:-:-:-:5 @P0 BRA.U LUT_LOOP; + + + +--:-:-:-:1 MOV RST, param_RST; +--:-:-:-:1 MOV DHWN1, param_DHWN; +--:-:-:-:1 SHL 
DHWN1, 1; + +--:-:-:-:1 LOP.AND readEs, readEs, 0x7f; +--:-:-:-:1 LOP.AND readFs, readFs, 0x3f; + +// Expand back out to undo our bank conflict avoiding stride +--:-:-:-:1 SHL readEs, readEs, 1; + +// writeCs = ((readFs / 4) * 64 + readEs) / 2; +--:-:-:-:1 ISCADD writeCs, readFs, readEs, 4; +--:-:-:-:1 SHR.U32 writeCs, writeCs, 1; + +// readCs = (tid & 31) << 2; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL readCs, tid31, 2; + +// nn = blkE*64 + (tid31 << 1); +--:-:-:-:1 SHL tid31, tid31, 1; +04:-:-:-:1 ISCADD nn, blkE, tid31, 6; + +// crst = blkF*32 +02:-:-:-:1 SHL crst00, blkF, 5; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + +--:-:-:-:1 LEA trackI0.CC, nn, param_I[0], 1; +--:-:-:-:1 LEA.HI.X trackI1, nn, param_I[1], RZ, 1; + +// P5 = nn < N +--:-:-:-:1 ISETP.LT.AND P5, PT, nn, param_N, PT; + +--:-:-:-:1 MOV alpha, param_alpha; + + + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:1 IADD crst00, crst00, 12;\n" . + "--:-:-:-:1 IADD crst04, crst04, 12;\n" . + "--:-:-:-:1 IADD crst08, crst08, 12;\n" . + "--:-:-:-:1 IADD crst12, crst12, 12;\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +// Round nearest +--:-:-:-:1 F2F.F16.F32 c0, c0; +--:-:1:-:1 F2F.F16.F32 c1, c1; +--:-:-:-:1 F2F.F16.F32 c2, c2; +--:-:2:-:1 F2F.F16.F32 c3, c3; +--:-:-:-:1 F2F.F16.F32 c4, c4; +--:-:3:-:1 F2F.F16.F32 c5, c5; +--:-:-:-:1 F2F.F16.F32 c6, c6; +--:-:4:-:1 F2F.F16.F32 c7, c7; + +// Pack 2 16 bit values into 32 bit words +11:-:-:-:2 BFI cs0, c1, 0x1010, c0; +02:-:-:-:2 BFI cs1, c3, 0x1010, c2; +24:-:-:-:2 BFI cs2, c5, 0x1010, c4; +08:-:-:-:0 BFI cs3, c7, 0x1010, c6; + +// Undo the stride in the X dim (items spaced by 32 are actually spaced 4) +--:-:-:-:4 STS.64 [writeCs+2x<0>], cs0; +--:-:-:-:1 STS.64 [writeCs+2x<4>], cs2; +--:-:-:-:1 LDS cs0, [readCs + 2x<0*64>]; +--:-:-:-:1 LDS cs1, [readCs + 2x<1*64>]; +--:-:-:-:1 LDS cs2, [readCs + 2x<2*64>]; +--:-:-:-:1 LDS cs3, [readCs + 2x<3*64>]; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; + +--:-:-:-:1 XMAD.LO2C c00, crst00, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c04, crst04, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c08, crst08, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c12, crst12, param_magic_RST, RZ; + +--:-:-:-:1 SHR.U32 c00, c00, param_shift_RST; +--:-:-:-:1 SHR.U32 c04, c04, param_shift_RST; +--:-:-:-:1 SHR.U32 c08, c08, param_shift_RST; +--:-:-:-:1 SHR.U32 c12, c12, param_shift_RST; + +--:-:-:-:1 VMAD.U16.U16 lut00, -c00, RST, crst00; +--:-:-:-:1 VMAD.U16.U16 lut04, -c04, RST, crst04; +--:-:-:-:1 VMAD.U16.U16 lut08, -c08, RST, crst08; +--:-:-:-:1 VMAD.U16.U16 lut12, -c12, RST, crst12; + +--:-:-:-:1 SHL lut00, lut00, 2; +--:-:-:-:1 SHL lut04, lut04, 2; +--:-:-:-:1 SHL lut08, lut08, 2; +--:-:-:-:1 SHL lut12, lut12, 2; + +--:-:-:-:1 XMAD.LO2 chan00, DHWN1, c00, RZ; +--:-:-:-:1 
XMAD.LO2 chan04, DHWN1, c04, RZ; +--:-:-:-:1 XMAD.LO2 chan08, DHWN1, c08, RZ; +--:-:-:-:1 XMAD.LO2 chan12, DHWN1, c12, RZ; + +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 IADD crst12, crst12, 1; + +--:-:1:-:1 @P0 LDS img00, [lut00 + addr_lut]; +--:-:2:-:1 @P1 LDS img04, [lut04 + addr_lut]; +--:-:3:-:1 @P2 LDS img08, [lut08 + addr_lut]; +--:-:4:-:1 @P3 LDS img12, [lut12 + addr_lut]; + + + +01:-:-:-:1 IADD3 track00I0.CC, trackI0, img00, chan00; +--:-:-:-:5 ISETP.GE.AND P0, PT, img00, RZ, P0; +--:-:-:-:1 IADD.X track00I1, trackI1, RZ; + +02:-:-:-:1 IADD3 track04I0.CC, trackI0, img04, chan04; +--:-:-:-:5 ISETP.GE.AND P1, PT, img04, RZ, P1; +--:-:-:-:1 IADD.X track04I1, trackI1, RZ; + +04:-:-:-:1 IADD3 track08I0.CC, trackI0, img08, chan08; +--:-:-:-:5 ISETP.GE.AND P2, PT, img08, RZ, P2; +--:-:-:-:1 IADD.X track08I1, trackI1, RZ; + +08:-:-:-:1 IADD3 track12I0.CC, trackI0, img12, chan12; +--:-:-:-:5 ISETP.GE.AND P3, PT, img12, RZ, P3; +--:-:-:-:0 IADD.X track12I1, trackI1, RZ; + +--:-:-:-:2 @P0 RED.E.ADD.F16x2.FTZ.RN [track00I], cs0; +--:5:-:-:2 @P1 RED.E.ADD.F16x2.FTZ.RN [track04I], cs1; +--:-:-:-:4 @P2 RED.E.ADD.F16x2.FTZ.RN [track08I], cs2; +--:6:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [track12I], cs3; + +--:-:-:-:5 RET; + diff --git a/Kernel/Convolution/Pascal/hconv_updat_C128_K128.sass b/Kernel/Convolution/Pascal/hconv_updat_C128_K128.sass new file mode 100644 index 0000000..d6c9c15 --- /dev/null +++ b/Kernel/Convolution/Pascal/hconv_updat_C128_K128.sass @@ -0,0 +1,775 @@ +# Kernel: hconv_updat_C128_K128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + addr_zero : 4x<(128*16 + 32)*4> + addr_blkIE : 4x<(128*16 + 32)*4 + 4> + addr_q : 4x<(128*16 + 32)*4 + 6> + szBuf : (128*16 + 32) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_offset_K : c[0x0][0x15c] + param_N : c[0x0][0x160] + param_K : c[0x0][0x164] + param_D : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_WN : c[0x0][0x174] + param_HWN : c[0x0][0x178] + param_DHWN : c[0x0][0x17c] + param_C : c[0x0][0x180] + param_CRST : c[0x0][0x184] + param_RST : c[0x0][0x188] + param_magic_RST : c[0x0][0x18c] + param_shift_RST : c[0x0][0x190] + param_RS : c[0x0][0x194] + param_magic_RS : c[0x0][0x198] + param_shift_RS : c[0x0][0x19c] + param_S : c[0x0][0x1a0] + param_magic_S : c[0x0][0x1a4] + param_shift_S : c[0x0][0x1a8] + param_pad_d : c[0x0][0x1ac] + param_pad_h : c[0x0][0x1b0] + param_pad_w : c[0x0][0x1b4] + param_str_d : c[0x0][0x1b8] + param_str_h : c[0x0][0x1bc] + param_str_w : c[0x0][0x1c0] + param_dil_d : c[0x0][0x1c4] + param_dil_h : c[0x0][0x1c8] + param_dil_w : c[0x0][0x1cc] + param_P : c[0x0][0x1d0] + param_Q : c[0x0][0x1d4] + param_PQ : c[0x0][0x1d8] + param_QN : c[0x0][0x1dc] + param_PQN : c[0x0][0x1e0] + param_MPQN : c[0x0][0x1e4] + param_magic_Q : c[0x0][0x1e8] + param_shift_Q : c[0x0][0x1ec] + param_magic_PQ : c[0x0][0x1f0] + 
param_shift_PQ : c[0x0][0x1f4] + param_grid_P : c[0x0][0x1f8] + param_grid_Q : c[0x0][0x1fc] + param_grid_PQ : c[0x0][0x200] + param_CRSTK : c[0x0][0x204] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-65 : one + 64-65 : blkIE<0-1> + 64-68 : blkI, blkE, tid, tidX, tidY + 69-95 ~ blkMPQ, tid1, tid7, tid128, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 69-95 ~ c, z, y, x, k, te, mt, pr, qs, r, s, t, rs, rst, crst, ti, xw, xW, yh, yH, zd, zD, cC, nextP, nextQ, Q + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 96-99 : loadI<0-3> + 96-99 : storeI<0-3> + 100-103 : loadI<4-7> + 112-115 : storeI<4-7> + + 104-107 : loadE<0-3> + 104-107 : storeE<0-3> + 108-111 : loadE<4-7> + 112-115 : storeE<4-7> + + 116-119 : trackI<0-1>, trackE<0-1> + + 120-124 ~ writeS, loopN, m, p, q + 125-127 ~ readIs, readEs, swapBuf + + 72-87 : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 88-124 ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128, blk_MPQ, CRSTK, xmad_determ + + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID1; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkI, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; +--:-:-:-:5 BRA.U END_CTAID1; +CTAID1: +--:-:2:-:1 S2R blkMPQ, SR_CTAID.Z; +--:-:3:-:1 S2R blkI, SR_CTAID.X; +--:-:4:-:1 S2R blkE, SR_CTAID.Y; +END_CTAID1: + + +// tidX = tid >> 1 +// tidY = (tid & 1) << 3 +// shiftX = (tid & 1) << 4 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tid1, 3; +--:-:-:-:1 SHL shiftX, tid1, 4; + +0c:-:-:-:1 STS.64 [addr_blkIE], 
blkIE; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; ++] + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_grid_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_grid_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; + +--:-:-:-:1 STS [addr_q], q; + +// writeS = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 IADD writeS, writeS, shiftX; +--:-:-:-:1 ISCADD writeS, writeS, 4x, 2; + +// readIs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readIs, tid, 0x70; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readEs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + szBuf; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readEs, tid128, 4; +--:-:-:-:1 LOP.OR readEs, readEs, tid7; +--:-:-:-:1 ISCADD readEs, readEs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, 
-4x; + +--:-:-:-:1 MOV loopN, RZ; + +// Flag for first load branch +--:-:-:-:1 PSETP.AND.AND P0, PT, PT, PT, PT; + + + +NEXT_PQ: + +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 LDS.U.64 blkIE, [addr_blkIE]; + + +// Zigzag q on odd p, but only if grid_P < P: Q = param_Q - 1 - q, otherwise Q = q +--:-:-:-:1 LOP.AND.NZ P1, RZ, p, 1; +--:-:-:-:1 MOV Q, param_grid_P; +--:-:-:-:1 ISETP.LT.AND P1, PT, Q, param_P, P1; +--:-:-:-:1 MOV Q, -1; +--:-:-:-:1 @P1 IADD3 Q, -q, param_Q, Q; +--:-:-:-:1 @!P1 MOV Q, q; +// tidX = tid >> 1 +// tidY = (tid & 1) << 3 +--:-:-:-:1 LOP.AND tidY, tid, 1; +02:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tidY, 3; +// crst = blkI*128 + tidX +04:-:-:-:1 ISCADD crst, blkI, tidX, 7; +// k = blkE*128 + tidX + offset_K +04:-:-:-:1 ISCADD k, blkE, tidX, 7; +--:-:-:-:1 IADD k, k, param_offset_K; + +// c = crst / RST +// rst = crst % RST +--:-:-:-:1 XMAD.LO2C c, crst, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c, c, param_shift_RST; +--:-:-:-:1 XMAD rst, c, param_RST, RZ; +--:-:-:-:1 IADD rst, -rst, crst; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = Q * str_w - pad_w + (s * dil_w) +// y = p * str_h - pad_h + (r * dil_h) +// z = m * str_d - pad_d + (t * dil_d) +--:-:-:-:1 XMAD qs, Q, param_str_w, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 IADD x, x, -param_pad_w; +--:-:-:-:1 IADD y, y, -param_pad_h; +--:-:-:-:1 IADD z, z, -param_pad_d; +// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY +--:-:-:-:1 XMAD ti, x, param_N, tidY; +--:-:-:-:1 XMAD.LO2C ti, y, param_WN, ti; +--:-:-:-:1 XMAD.LO2C ti, z, param_HWN, ti; 
+--:-:-:-:1 XMAD.LO2C ti, c, param_DHWN, ti; +--:-:-:-:1 LEA trackI0.CC, ti, param_I[0], 1; +--:-:-:-:1 LEA.HI.X trackI1, ti, param_I[1], RZ, 1; +// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY +--:-:-:-:1 XMAD te, Q, param_N, tidY; +--:-:-:-:1 XMAD.LO2C te, p, param_QN, te; +--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te; +--:-:-:-:1 XMAD.LO2C te, k, param_MPQN, te; +--:-:-:-:1 LEA trackE0.CC, te, param_E[0], 1; +--:-:-:-:1 LEA.HI.X trackE1, te, param_E[1], RZ, 1; +// Bounds check x,y,z,c for each I track. +// If out of bounds, this will set the track address to -1 +--:-:-:-:1 ISET.GE.AND cC, c, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd, z, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD, z, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh, y, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH, y, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw, x, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW, x, param_W, PT; +--:-:-:-:1 LOP.OR trackI0, trackI0, cC; +--:-:-:-:1 LOP3.LUT trackI0, trackI0, zd, zD, 0xfe; +--:-:-:-:1 LOP3.LUT trackI0, trackI0, yh, yH, 0xfe; +--:-:-:-:1 LOP3.LUT trackI0, trackI0, xw, xW, 0xfe; + +01:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:0 ISETP.NE.AND P2, PT, trackI0, -1, PT; // P2 = trackI0 != -1 +--:-:-:-:1 ISETP.LT.AND P4, PT, k, param_K, PT; // P4 = k < K +--:-:-:-:1 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; // P5 = q + grid_Q < Q +--:-:-:-:1 ISETP.LT.AND P6, PT, nextP, param_P, PT; // P6 = p + grid_P < P +--:-:-:-:1 IADD loopN, loopN, param_N; + + +--:-:-:Y:5 @P0 BRA.U FIRST_LOAD; + +INIT_LOOP: + +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*128 + 64>]; +--:-:1:-:2 LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + +NEXT_16N: + +[+ + + our $convert; + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD loopN, loopN, -16;\n", + + # p0 = (N & 16) == 0 + # p1 = N >= 32 && p0 + j0c14 => "--:-:-:-:1 LOP.AND.NZ P0, RZ, loopN, 16;\n", + j0c28 => "--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 32, P0;\n", + + j2c34 => "--:-:-:-:1 \@P0 $convert storeI7, loadI7.H1;\n", 
+ j2c38 => "--:-:-:-:1 \@P0 $convert storeI6, loadI7.H0;\n", + j2c42 => "--:-:-:-:1 \@P0 $convert storeI5, loadI6.H1;\n", + j2c46 => "--:-:-:-:1 \@P0 $convert storeI4, loadI6.H0;\n", + j2c50 => "--:-:-:-:1 \@P0 $convert storeI3, loadI5.H1;\n", + j2c54 => "--:-:-:-:1 \@P0 $convert storeI2, loadI5.H0;\n", + j2c58 => "--:-:-:-:1 \@P0 $convert storeI1, loadI4.H1;\n", + j2c62 => "--:-:-:-:1 \@P0 $convert storeI0, loadI4.H0;\n", + + j3c34 => "02:-:-:-:1 \@!P0 $convert storeI7, loadI3.H1;\n", + j3c38 => "--:-:-:-:1 \@!P0 $convert storeI6, loadI3.H0;\n", + j3c42 => "--:-:-:-:1 \@!P0 $convert storeI5, loadI2.H1;\n", + j3c46 => "--:-:5:-:1 \@!P0 $convert storeI4, loadI2.H0;\n", + j3c50 => "--:-:-:-:1 \@!P0 $convert storeI3, loadI1.H1;\n", + j3c54 => "--:-:-:-:1 \@!P0 $convert storeI2, loadI1.H0;\n", + j3c58 => "--:-:-:-:1 \@!P0 $convert storeI1, loadI0.H1;\n", + j3c62 => "--:-:2:-:1 \@!P0 $convert storeI0, loadI0.H0;\n", + + j4c8 => "10:-:-:-:1 STS [writeS + 4x<7*128>], storeI7;\n", + j4c10 => "--:-:-:-:1 STS [writeS + 4x<6*128>], storeI6;\n", + j4c12 => "--:-:-:-:1 STS [writeS + 4x<5*128>], storeI5;\n", + j4c14 => "--:-:-:-:1 STS [writeS + 4x<4*128>], storeI4;\n", + j4c16 => "02:-:-:-:1 STS [writeS + 4x<3*128>], storeI3;\n", + j4c18 => "--:-:-:-:1 STS [writeS + 4x<2*128>], storeI2;\n", + j4c20 => "--:-:-:-:1 STS [writeS + 4x<1*128>], storeI1;\n", + j4c22 => "--:2:-:-:1 STS [writeS + 4x<0*128>], storeI0;\n", + + j4c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, trackI0, -1, P1;\n", + j4c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, trackI0, -1, P1;\n", + + j5c8 => "02:-:-:-:1 \@P2 LDG.E.CI.128 loadI0, [trackI + 2x< 0>];\n", + j5c10 => "--:5:2:-:1 \@P2 LDG.E.CI.128 loadI4, [trackI + 2x<16>];\n", + + j6c8 => "--:-:-:-:1 \@P3 LDS.U.128 loadI0, [addr_zero];\n", + j7c8 => "--:-:-:-:1 \@P3 LDS.U.128 loadI4, [addr_zero];\n", + + j7c57 => "10:-:-:-:1 \@P2 IADD trackI0.CC, trackI0, 2x<32>;\n", + j7c63 => "--:-:-:-:1 \@P2 IADD.X trackI1, trackI1, RZ;\n", + + + j10c34 => "--:-:-:-:1 \@P0 $convert 
storeE7, loadE7.H1;\n", + j10c38 => "--:-:-:-:1 \@P0 $convert storeE6, loadE7.H0;\n", + j10c42 => "--:-:-:-:1 \@P0 $convert storeE5, loadE6.H1;\n", + j10c46 => "--:-:-:-:1 \@P0 $convert storeE4, loadE6.H0;\n", + j10c50 => "--:-:-:-:1 \@P0 $convert storeE3, loadE5.H1;\n", + j10c54 => "--:-:-:-:1 \@P0 $convert storeE2, loadE5.H0;\n", + j10c58 => "--:-:-:-:1 \@P0 $convert storeE1, loadE4.H1;\n", + j10c62 => "--:-:-:-:1 \@P0 $convert storeE0, loadE4.H0;\n", + + j11c34 => "04:-:-:-:1 \@!P0 $convert storeE7, loadE3.H1;\n", + j11c38 => "--:-:-:-:1 \@!P0 $convert storeE6, loadE3.H0;\n", + j11c42 => "--:-:-:-:1 \@!P0 $convert storeE5, loadE2.H1;\n", + j11c46 => "--:-:5:-:1 \@!P0 $convert storeE4, loadE2.H0;\n", + j11c50 => "--:-:-:-:1 \@!P0 $convert storeE3, loadE1.H1;\n", + j11c54 => "--:-:-:-:1 \@!P0 $convert storeE2, loadE1.H0;\n", + j11c58 => "--:-:-:-:1 \@!P0 $convert storeE1, loadE0.H1;\n", + j11c62 => "--:-:3:-:1 \@!P0 $convert storeE0, loadE0.H0;\n", + + j12c8 => "10:-:-:-:1 STS [writeS + 4x<7*128 + szBuf>], storeE7;\n", + j12c10 => "--:-:-:-:1 STS [writeS + 4x<6*128 + szBuf>], storeE6;\n", + j12c12 => "--:-:-:-:1 STS [writeS + 4x<5*128 + szBuf>], storeE5;\n", + j12c14 => "--:-:-:-:1 STS [writeS + 4x<4*128 + szBuf>], storeE4;\n", + j12c16 => "04:-:-:-:1 STS [writeS + 4x<3*128 + szBuf>], storeE3;\n", + j12c18 => "--:-:-:-:1 STS [writeS + 4x<2*128 + szBuf>], storeE2;\n", + j12c20 => "--:-:-:-:1 STS [writeS + 4x<1*128 + szBuf>], storeE1;\n", + j12c22 => "--:3:-:-:1 STS [writeS + 4x<0*128 + szBuf>], storeE0;\n", + + j12c24 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P1, P4, PT;\n", + + j13c8 => "04:-:-:-:1 \@P2 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];\n", + j13c10 => "--:5:3:-:1 \@P2 LDG.E.CI.128 loadE4, [trackE + 2x<16>];\n", + + j15c57 => "10:-:-:-:1 \@P2 IADD trackE0.CC, trackE0, 2x<32>;\n", + j15c62 => "--:-:-:-:1 \@P2 IADD.X trackE1, trackE1, RZ;\n", + + # p0 = N >= 16 and not (N == 32 and (p or q)) + j14c8 => "--:-:-:-:1 ISETP.EQ.AND P0, PT, loopN, 32, PT;\n", + j14c10 => 
"--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 16, PT;\n", + j14c22 => "--:-:-:-:1 PSETP.OR.AND P0, PT, P5, P6, P0;\n", + j14c35 => "--:-:-:-:1 PSETP.AND.AND P0, PT, !P0, P1, PT;\n", + + j14c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "20:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j15c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_16N;\n" . + "--:-:-:-:0 \@P5 IADD q, q, param_grid_Q;\n" . + "01:-:-:Y:5 \@P5 BRA.U NEXT_PQ;\n" . + "--:-:1:-:1 \@P6 LDS q, [addr_q];\n" . + "--:-:-:-:0 \@P6 IADD p, p, param_grid_P;\n" . + "--:-:-:Y:5 \@P6 BRA.U NEXT_PQ;\n" . + "--:-:-:Y:5 BRA.U FINISH;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? '@P0' : ' '; + my $shift = $rsOffset < 8 ? 0 : 1; + my $barrier = $j == 14 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + ++] + +FIRST_LOAD: + +--:-:-:-:8 PSETP.AND.AND P0, PT, PT, PT, !PT; + +// p1 = N == 32 and (p or q) +--:-:-:-:0 ISETP.EQ.AND P1, PT, loopN, 32, PT; + +--:-:-:-:1 @P2 LDG.E.CI.128 loadI0, [trackI + 2x< 0>]; +--:-:1:-:1 @P2 LDG.E.CI.128 loadI4, [trackI + 2x<16>]; +--:-:-:-:1 @!P2 LDS.U.128 loadI0, [addr_zero]; +--:-:5:-:1 @!P2 LDS.U.128 loadI4, [addr_zero]; + +--:-:-:-:1 @P4 LDG.E.CI.128 loadE0, [trackE + 2x< 0>]; +--:-:2:-:1 @P4 LDG.E.CI.128 loadE4, [trackE + 2x<16>]; +--:-:-:-:1 @!P4 LDS.U.128 loadE0, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 loadE4, [addr_zero]; + +11:-:-:-:1 [+ convert() +] storeI7, loadI3.H1; +--:-:-:-:1 [+ convert() +] storeI6, loadI3.H0; +--:-:-:-:1 [+ convert() +] storeI5, loadI2.H1; +--:-:1:-:1 [+ convert() +] storeI4, loadI2.H0; +--:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:5:-:1 [+ convert() +] storeI0, loadI0.H0; + +--:-:-:-:1 PSETP.OR.AND P1, PT, P5, P6, P1; + +--:-:-:-:0 @P2 IADD trackI0.CC, trackI0, 2x<32>; + +01:-:-:-:1 STS [writeS + 4x<7*128>], storeI7; +--:-:-:-:1 STS [writeS + 4x<6*128>], storeI6; +--:-:-:-:1 STS [writeS + 4x<5*128>], storeI5; +--:-:-:-:1 STS [writeS + 4x<4*128>], storeI4; +10:-:-:-:1 STS [writeS + 4x<3*128>], storeI3; +--:-:-:-:1 STS [writeS + 4x<2*128>], storeI2; +--:-:-:-:1 STS [writeS + 4x<1*128>], storeI1; +--:1:-:-:2 STS [writeS + 4x<0*128>], storeI0; + +--:-:-:-:0 @P2 IADD.X trackI1, trackI1, RZ; + +23:-:-:-:1 [+ convert() +] storeE7, loadE3.H1; +--:-:-:-:1 [+ convert() +] storeE6, loadE3.H0; +--:-:-:-:1 [+ convert() +] storeE5, loadE2.H1; +--:-:2:-:1 [+ convert() +] storeE4, loadE2.H0; +--:-:-:-:1 [+ convert() +] storeE3, loadE1.H1; +--:-:-:-:1 [+ convert() +] storeE2, loadE1.H0; +--:-:-:-:1 [+ convert() +] 
storeE1, loadE0.H1; +--:-:6:-:1 [+ convert() +] storeE0, loadE0.H0; + +--:-:-:-:2 PSETP.AND.AND P5, PT, P5, P1, PT; +--:-:-:-:1 PSETP.AND.AND P6, PT, P6, P1, PT; +--:-:-:-:0 @P4 IADD trackE0.CC, trackE0, 2x<32>; + +02:-:-:-:1 STS [writeS + 4x<7*128 + szBuf>], storeE7; +--:-:-:-:1 STS [writeS + 4x<6*128 + szBuf>], storeE6; +--:-:-:-:1 STS [writeS + 4x<5*128 + szBuf>], storeE5; +--:-:-:-:1 STS [writeS + 4x<4*128 + szBuf>], storeE4; +20:-:-:-:1 STS [writeS + 4x<3*128 + szBuf>], storeE3; +--:-:-:-:1 STS [writeS + 4x<2*128 + szBuf>], storeE2; +--:-:-:-:1 STS [writeS + 4x<1*128 + szBuf>], storeE1; +--:1:-:-:1 STS [writeS + 4x<0*128 + szBuf>], storeE0; + +--:-:-:-:1 @P4 IADD.X trackE1, trackE1, RZ; + +--:-:-:-:1 IADD readEs, readEs, -swapBuf; +--:-:-:-:0 IADD readIs, readIs, -swapBuf; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:0 @P5 IADD q, q, param_grid_Q; +--:-:-:Y:5 @P5 BRA.U NEXT_PQ; +--:-:-:-:0 @P6 IADD p, p, param_grid_P; +--:-:-:Y:5 @P6 BRA.U NEXT_PQ; + +--:-:-:-:2 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:Y:5 BRA.U INIT_LOOP; + + +FINISH: + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID2; +--:-:2:-:1 S2R blkI, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.X; +--:-:-:-:5 BRA.U END_CTAID2; +CTAID2: +--:-:2:-:1 S2R blkI, SR_CTAID.X; +--:-:3:-:1 S2R blkE, SR_CTAID.Y; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.Z; +END_CTAID2: + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readEs, readEs, -4x; +--:-:-:-:1 @P0 IADD readIs, readIs, -swapBuf; +--:-:-:-:1 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 128 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 5; + +--:-:-:-:1 LOP.AND tid31, tid, 31; 
+--:-:-:-:1 LOP.AND tid96, tid, 96; +01:-:-:-:1 LOP.AND t128, tid, 128; + +// kk = tid31 | (t128 >> 2); +--:-:-:-:1 SHR.U32 kk, t128, 2; +--:-:-:-:1 LOP.OR kk, tid31, kk; + +// readCs = ((tid96 << 4) | kk) << 2; +--:-:-:-:1 SHL readCs, tid96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, kk; +--:-:-:-:1 SHL readCs, readCs, 2; + +// kk += blkE*128; +04:-:-:-:1 ISCADD kk, blkE, kk, 7; +--:-:-:-:1 IADD kk, kk, param_offset_K; + +// crst = blkI*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 crst00, tid96, 1; +02:-:-:-:1 ISCADD crst00, blkI, crst00, 7; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; +--:-:-:-:1 SHL K4, K, 4; +--:-:-:-:1 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +--:-:-:-:1 VMAD.U16.U16 tf, crst00, K, kk; +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSTK, param_CRSTK; +08:-:-:-:1 XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ; + }; + } + return ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 0x2; +--:-:-:-:1 LEA.HI.X track00F1, tf, param_F[1], RZ, 0x2; + +// kk < K +--:-:-:-:1 ISETP.LT.AND P5, PT, kk, param_K, PT; +--:-:-:-:1 IADD kk, kk, 64; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, PT; + +--:-:-:-:1 MOV alpha, param_alpha; + + + +--:-:-:-:6 IADD track04F0.CC, track00F0, K4; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:6 IADD track08F0.CC, track04F0, K4; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:6 IADD track12F0.CC, track08F0, K4; +--:-:-:-:0 IADD.X track12F1, track08F1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + +[+ + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD track00F0.CC, track00F0, K60;\n" . + "--:-:-:-:1 IADD crst00, crst00, 60;\n" . + "--:-:-:-:1 IADD.X track00F1, track00F1, RZ;\n" . + "--:-:-:-:5 IADD track04F0.CC, track04F0, K60;\n" . + "--:-:-:-:1 IADD crst04, crst04, 60;\n" . + "--:-:-:-:1 IADD.X track04F1, track04F1, RZ;\n" . 
+ "--:-:-:-:5 IADD track08F0.CC, track08F0, K60;\n" . + "--:-:-:-:1 IADD crst08, crst08, 60;\n" . + "--:-:-:-:1 IADD.X track08F1, track08F1, RZ;\n" . + "--:-:-:-:5 IADD track12F0.CC, track12F0, K60;\n" . + "--:-:-:-:1 IADD crst12, crst12, 60;\n" . + "--:-:-:-:1 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL f0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL f1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL f2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL f3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL f4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL f5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL f6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL f7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + ++] + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +--:-:-:-:0 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], f0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], f4; + +--:-:1:-:1 LDS f0, [readCs + 4x<0*128 + 00>]; +--:-:2:-:1 LDS f2, [readCs + 4x<1*128 + 00>]; +--:-:3:-:1 LDS f4, [readCs + 4x<2*128 + 00>]; +--:-:4:-:a LDS f6, [readCs + 4x<3*128 + 00>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:-:-:-:1 @P0 STG.E.CG [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 STG.E.CG [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 STG.E.CG [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 STG.E.CG [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, 
PT; + }; + } + else + { + return q{ +01:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } ++] + +--:-:1:-:1 LDS f1, [readCs + 4x<0*128 + 64>]; +--:-:2:-:1 LDS f3, [readCs + 4x<1*128 + 64>]; +--:-:3:-:1 LDS f5, [readCs + 4x<2*128 + 64>]; +--:-:4:-:a LDS f7, [readCs + 4x<3*128 + 64>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:1:-:-:1 @P0 STG.E.CG [track00F + 4x<64>], f1; +02:2:-:-:1 @P1 STG.E.CG [track04F + 4x<64>], f3; +04:3:-:-:1 @P2 STG.E.CG [track08F + 4x<64>], f5; +08:4:-:-:1 @P3 STG.E.CG [track12F + 4x<64>], f7; + }; + } + else + { + return q{ +01:1:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], f1; +02:2:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], f3; +04:3:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], f5; +08:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], f7; + }; + } ++] + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Pascal/hconv_updat_C128_K64.sass b/Kernel/Convolution/Pascal/hconv_updat_C128_K64.sass new file mode 100644 index 0000000..a40fcb8 --- /dev/null +++ b/Kernel/Convolution/Pascal/hconv_updat_C128_K64.sass @@ -0,0 +1,860 @@ +# Kernel: hconv_updat_C128_K64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2> + szShareI : (128*16 + 32) + szShareE : (64*16 + 32) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_offset_K : c[0x0][0x15c] + param_N : c[0x0][0x160] + param_K : c[0x0][0x164] + param_D : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_WN : c[0x0][0x174] + param_HWN : c[0x0][0x178] + param_DHWN : c[0x0][0x17c] + param_C : c[0x0][0x180] + param_CRST : c[0x0][0x184] + param_RST : c[0x0][0x188] + param_magic_RST : c[0x0][0x18c] + param_shift_RST : c[0x0][0x190] + param_RS : c[0x0][0x194] + param_magic_RS : c[0x0][0x198] + param_shift_RS : c[0x0][0x19c] + param_S : c[0x0][0x1a0] + param_magic_S : c[0x0][0x1a4] + param_shift_S : c[0x0][0x1a8] + param_pad_d : c[0x0][0x1ac] + param_pad_h : c[0x0][0x1b0] + param_pad_w : c[0x0][0x1b4] + param_str_d : c[0x0][0x1b8] + param_str_h : c[0x0][0x1bc] + param_str_w : c[0x0][0x1c0] + param_dil_d : c[0x0][0x1c4] + param_dil_h : c[0x0][0x1c8] + param_dil_w : c[0x0][0x1cc] + param_P : c[0x0][0x1d0] + param_Q : c[0x0][0x1d4] + param_PQ : c[0x0][0x1d8] + param_QN : c[0x0][0x1dc] + param_PQN : c[0x0][0x1e0] + param_MPQN : c[0x0][0x1e4] + 
param_magic_Q : c[0x0][0x1e8] + param_shift_Q : c[0x0][0x1ec] + param_magic_PQ : c[0x0][0x1f0] + param_shift_PQ : c[0x0][0x1f4] + param_grid_P : c[0x0][0x1f8] + param_grid_Q : c[0x0][0x1fc] + param_grid_PQ : c[0x0][0x200] + param_CRSTK : c[0x0][0x204] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkI, blkE, one + 68-99 ~ blkMPQ, tidX, tid1, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-72 ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q + 73-99 ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1> + 73-99 ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 100-131 : load0I<0-7>, load1I<0-7>, loadE<0-7>, storeX<0-7> + 132-137 : track0I<0-1>, track1I<0-1>, trackE<0-1> + + 138-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY + 165-167 ~ readIs, readEs, swapBuf + + 68-83 : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID1; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkI, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; +--:-:-:-:5 BRA.U END_CTAID1; +CTAID1: +--:-:2:-:1 S2R blkMPQ, SR_CTAID.Z; +--:-:3:-:1 S2R blkI, SR_CTAID.X; +--:-:4:-:1 S2R blkE, SR_CTAID.Y; +END_CTAID1: + + +// tidX = tid >> 1 +// tidY = (tid & 1) << 3 +// shiftX = (tid & 1) << 4 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tid1, 3; +--:-:-:-:1 SHL 
shiftX, tid1, 4; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_grid_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_grid_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; +--:-:-:-:1 MOV qq, q; + +// writeIs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 7; +--:-:-:-:1 IADD writeIs, writeIs, shiftX; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +// writeEs = (tidY*64 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeEs, tidY, tidX, 6; +--:-:-:-:1 IADD writeEs, writeEs, shiftX; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x, 2; + +// readIs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readIs, tid, -16; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; +--:-:-:-:1 SHL readIs, readIs, 4; +// readEs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +--:-:-:-:1 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readEs, 
readEs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + +// crst = blockI*128 + tid +04:-:-:-:1 ISCADD crst0, blkI, tidX, 7; +--:-:-:-:1 IADD crst1, crst0, 64; + +// k = blockE*64 + tid +08:-:-:-:1 ISCADD k, blkE, tidX, 6; +--:-:-:-:1 IADD k, k, param_offset_K; + +--:-:-:-:1 MOV loopN, RZ; + +// Flag for first load branch +--:-:-:-:1 PSETP.AND.AND P0, PT, PT, PT, PT; + + +NEXT_PQ: + + +// Zigzag q but only if grid_P < P +--:-:-:-:1 LOP.AND.NZ P1, RZ, p, 1; +--:-:-:-:1 MOV Q, param_grid_P; +--:-:-:-:1 ISETP.LT.AND P1, PT, Q, param_P, P1; +--:-:-:-:1 MOV Q, -1; +--:-:-:-:1 @P1 IADD3 Q, -q, param_Q, Q; +--:-:-:-:1 @!P1 MOV Q, q; +// c = crst / RST +// rst = crst % RST +--:-:-:-:1 XMAD.LO2C c0, crst0, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c0, c0, param_shift_RST; +--:-:-:-:1 XMAD rst0, c0, param_RST, RZ; +--:-:-:-:1 IADD rst0, -rst0, crst0; +--:-:-:-:1 XMAD.LO2C c1, crst1, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c1, c1, param_shift_RST; +--:-:-:-:1 XMAD rst1, c1, param_RST, RZ; +--:-:-:-:1 IADD rst1, -rst1, crst1; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t0, rst0, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t0, t0, param_shift_RS; +--:-:-:-:1 XMAD rs0, t0, param_RS, RZ; +--:-:-:-:1 IADD rs0, -rs0, rst0; +--:-:-:-:1 XMAD.LO2C t1, rst1, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t1, t1, param_shift_RS; +--:-:-:-:1 XMAD rs1, t1, param_RS, RZ; +--:-:-:-:1 IADD rs1, -rs1, rst1; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r0, rs0, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r0, r0, param_shift_S; +--:-:-:-:1 XMAD s0, r0, param_S, RZ; +--:-:-:-:1 IADD s0, -s0, rs0; +--:-:-:-:1 XMAD.LO2C r1, rs1, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r1, r1, param_shift_S; +--:-:-:-:1 XMAD s1, r1, param_S, RZ; +--:-:-:-:1 IADD s1, -s1, rs1; +// z = m * w - pad_d + t +// y = p * u - pad_h + r +// x = q * v - pad_w + s +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, Q, param_str_w, RZ; +--:-:-:-:1 XMAD z1, t1, param_dil_d, mt; 
+// ---------------------------------------------------------------------------
+// NEXT_PQ (continued): input coordinates and global track addresses.
+//
+// Two I tracks are maintained per thread, one for crst0 and one for crst1.
+// For each track the input coordinate is
+//     z = m*str_d + t*dil_d - pad_d
+//     y = p*str_h + r*dil_h - pad_h
+//     x = Q*str_w + s*dil_w - pad_w
+// where mt, pr and qs already hold m*str_d, p*str_h and Q*str_w.
+//
+// Track 0 must be written to z0/y0/x0 and track 1 to z1/y1/x1: every
+// consumer below (padding adjust, ti0/ti1, bounds checks) reads them per track.
+// The dilation (param_dil_*) scales t/r/s; the stride is already folded into
+// mt/pr/qs and must not be applied a second time.
+// ---------------------------------------------------------------------------
+--:-:-:-:1 XMAD y1, r1, param_dil_h, pr;
+--:-:-:-:1 XMAD x1, s1, param_dil_w, qs;
+--:-:-:-:1 XMAD z0, t0, param_dil_d, mt;
+--:-:-:-:1 XMAD y0, r0, param_dil_h, pr;
+--:-:-:-:1 XMAD x0, s0, param_dil_w, qs;
+// Remove the padding offset from every coordinate of both tracks.
+--:-:-:-:1 IADD z1, z1, -param_pad_d;
+--:-:-:-:1 IADD y1, y1, -param_pad_h;
+--:-:-:-:1 IADD x1, x1, -param_pad_w;
+--:-:-:-:1 IADD z0, z0, -param_pad_d;
+--:-:-:-:1 IADD y0, y0, -param_pad_h;
+--:-:-:-:1 IADD x0, x0, -param_pad_w;
+
+
+// Split blocks to fit inside of 36 registers
+
+
+// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY   (element offset, one per track)
+--:-:-:-:1 XMAD.LO2C ti0, c0, param_DHWN, tidY;
+--:-:-:-:1 XMAD.LO2C ti0, z0, param_HWN, ti0;
+--:-:-:-:1 XMAD.LO2C ti0, y0, param_WN, ti0;
+--:-:-:-:1 XMAD ti0, x0, param_N, ti0;
+--:-:-:-:1 XMAD.LO2C ti1, c1, param_DHWN, tidY;
+--:-:-:-:1 XMAD.LO2C ti1, z1, param_HWN, ti1;
+--:-:-:-:1 XMAD.LO2C ti1, y1, param_WN, ti1;
+--:-:-:-:1 XMAD ti1, x1, param_N, ti1;
+// Turn each element offset into a 64-bit address: offset << 1 (2-byte
+// elements) plus the base pointer param_I; .CC / .HI.X carry into the high word.
+--:-:-:-:1 LEA track0I0.CC, ti0, param_I[0], 1;
+--:-:-:-:1 LEA.HI.X track0I1, ti0, param_I[1], RZ, 1;
+--:-:-:-:1 LEA track1I0.CC, ti1, param_I[0], 1;
+--:-:-:-:1 LEA.HI.X track1I1, ti1, param_I[1], RZ, 1;
+
+// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY   (Q is the zigzagged q chosen above)
+--:-:-:-:1 XMAD.LO2C te, k, param_MPQN, tidY;
+--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te;
+--:-:-:-:1 XMAD.LO2C te, p, param_QN, te;
+--:-:-:-:1 XMAD te, Q, param_N, te;
+--:-:-:-:1 LEA trackE0.CC, te, param_E[0], 1;
+--:-:-:-:0 LEA.HI.X trackE1, te, param_E[1], RZ, 1;
+
+// Bounds check x,y,z,c for each I track.
+// If out of bounds, this will set the track address to -1 +--:-:-:-:1 ISET.GE.AND cC0, c0, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd0, z0, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD0, z0, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh0, y0, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH0, y0, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw0, x0, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW0, x0, param_W, PT; +--:-:-:-:1 LOP.OR track0I0, track0I0, cC0; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe; + +--:-:-:-:1 ISET.GE.AND cC1, c1, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd1, z1, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD1, z1, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh1, y1, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH1, y1, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw1, x1, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW1, x1, param_W, PT; +--:-:-:-:1 LOP.OR track1I0, track1I0, cC1; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:1 ISETP.NE.AND P2, PT, track0I0, -1, PT; +--:-:-:-:0 ISETP.NE.AND P3, PT, track1I0, -1, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, k, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:-:1 IADD loopN, loopN, param_N; + + +--:-:-:Y:5 @P0 BRA.U FIRST_LOAD; + +INIT_LOOP: + +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>]; +--:-:1:-:2 LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + +NEXT_16N: + +[+ + + our $convert; + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD loopN, loopN, -16;\n", + + # p0 = (N & 16) == 0 + # p1 = N >= 32 && p0 + j0c14 => "--:-:-:-:1 LOP.AND.NZ P0, RZ, loopN, 16;\n", + j0c28 => 
"--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 32, P0;\n", + + j0c34 => "--:-:-:-:1 \@P0 $convert storeX7, load0I7.H1;\n", + j0c38 => "--:-:-:-:1 \@P0 $convert storeX6, load0I7.H0;\n", + j0c42 => "--:-:-:-:1 \@P0 $convert storeX5, load0I6.H1;\n", + j0c46 => "--:-:-:-:1 \@P0 $convert storeX4, load0I6.H0;\n", + j0c50 => "--:-:-:-:1 \@P0 $convert storeX3, load0I5.H1;\n", + j0c54 => "--:-:-:-:1 \@P0 $convert storeX2, load0I5.H0;\n", + j0c58 => "--:-:-:-:1 \@P0 $convert storeX1, load0I4.H1;\n", + j0c62 => "--:-:-:-:1 \@P0 $convert storeX0, load0I4.H0;\n", + + j1c34 => "02:-:-:-:1 \@!P0 $convert storeX7, load0I3.H1;\n", + j1c38 => "--:-:-:-:1 \@!P0 $convert storeX6, load0I3.H0;\n", + j1c42 => "--:-:-:-:1 \@!P0 $convert storeX5, load0I2.H1;\n", + j1c46 => "--:-:5:-:1 \@!P0 $convert storeX4, load0I2.H0;\n", + j1c50 => "--:-:-:-:1 \@!P0 $convert storeX3, load0I1.H1;\n", + j1c54 => "--:-:-:-:1 \@!P0 $convert storeX2, load0I1.H0;\n", + j1c58 => "--:-:-:-:1 \@!P0 $convert storeX1, load0I0.H1;\n", + j1c62 => "--:-:2:-:1 \@!P0 $convert storeX0, load0I0.H0;\n", + + j2c8 => "10:-:-:-:1 STS [writeIs + 4x<7*128 + 0>], storeX7;\n", + j2c10 => "--:-:-:-:1 STS [writeIs + 4x<6*128 + 0>], storeX6;\n", + j2c12 => "--:-:-:-:1 STS [writeIs + 4x<5*128 + 0>], storeX5;\n", + j2c14 => "--:-:-:-:1 STS [writeIs + 4x<4*128 + 0>], storeX4;\n", + j2c16 => "02:-:-:-:1 STS [writeIs + 4x<3*128 + 0>], storeX3;\n", + j2c18 => "--:-:-:-:1 STS [writeIs + 4x<2*128 + 0>], storeX2;\n", + j2c20 => "--:-:-:-:1 STS [writeIs + 4x<1*128 + 0>], storeX1;\n", + j2c22 => "--:2:-:-:1 STS [writeIs + 4x<0*128 + 0>], storeX0;\n", + + j2c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, track0I0, -1, P1;\n", + j2c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n", + + j3c8 => "02:-:-:-:1 \@P2 LDG.E.CI.128 load0I0, [track0I + 2x< 0>];\n", + j3c10 => "--:5:2:-:1 \@P2 LDG.E.CI.128 load0I4, [track0I + 2x<16>];\n", + + j4c8 => "--:-:-:-:1 \@P3 LDS.U.128 load0I0, [addr_zero];\n", + j5c8 => "--:-:-:-:1 \@P3 LDS.U.128 load0I4, 
[addr_zero];\n", + + j5c57 => "10:-:-:-:1 \@P2 IADD track0I0.CC, track0I0, 2x<32>;\n", + j5c63 => "--:-:-:-:1 \@P2 IADD.X track0I1, track0I1, RZ;\n", + + + j5c34 => "--:-:-:-:1 \@P0 $convert storeX7, load1I7.H1;\n", + j5c38 => "--:-:-:-:1 \@P0 $convert storeX6, load1I7.H0;\n", + j5c42 => "--:-:-:-:1 \@P0 $convert storeX5, load1I6.H1;\n", + j5c46 => "--:-:-:-:1 \@P0 $convert storeX4, load1I6.H0;\n", + j5c50 => "--:-:-:-:1 \@P0 $convert storeX3, load1I5.H1;\n", + j5c54 => "--:-:-:-:1 \@P0 $convert storeX2, load1I5.H0;\n", + j5c58 => "--:-:-:-:1 \@P0 $convert storeX1, load1I4.H1;\n", + j5c62 => "--:-:-:-:1 \@P0 $convert storeX0, load1I4.H0;\n", + + j6c34 => "04:-:-:-:1 \@!P0 $convert storeX7, load1I3.H1;\n", + j6c38 => "--:-:-:-:1 \@!P0 $convert storeX6, load1I3.H0;\n", + j6c42 => "--:-:-:-:1 \@!P0 $convert storeX5, load1I2.H1;\n", + j6c46 => "--:-:5:-:1 \@!P0 $convert storeX4, load1I2.H0;\n", + j6c50 => "--:-:-:-:1 \@!P0 $convert storeX3, load1I1.H1;\n", + j6c54 => "--:-:-:-:1 \@!P0 $convert storeX2, load1I1.H0;\n", + j6c58 => "--:-:-:-:1 \@!P0 $convert storeX1, load1I0.H1;\n", + j6c62 => "--:-:3:-:1 \@!P0 $convert storeX0, load1I0.H0;\n", + + j7c8 => "10:-:-:-:1 STS [writeIs + 4x<7*128 + 64>], storeX7;\n", + j7c10 => "--:-:-:-:1 STS [writeIs + 4x<6*128 + 64>], storeX6;\n", + j7c12 => "--:-:-:-:1 STS [writeIs + 4x<5*128 + 64>], storeX5;\n", + j7c14 => "--:-:-:-:1 STS [writeIs + 4x<4*128 + 64>], storeX4;\n", + j7c16 => "04:-:-:-:1 STS [writeIs + 4x<3*128 + 64>], storeX3;\n", + j7c18 => "--:-:-:-:1 STS [writeIs + 4x<2*128 + 64>], storeX2;\n", + j7c20 => "--:-:-:-:1 STS [writeIs + 4x<1*128 + 64>], storeX1;\n", + j7c22 => "--:3:-:-:1 STS [writeIs + 4x<0*128 + 64>], storeX0;\n", + + j7c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, track1I0, -1, P1;\n", + j7c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n", + + j8c8 => "04:-:-:-:1 \@P2 LDG.E.CI.128 load1I0, [track1I + 2x< 0>];\n", + j8c10 => "--:5:3:-:1 \@P2 LDG.E.CI.128 load1I4, [track1I + 2x<16>];\n", + + j9c8 => 
"--:-:-:-:1 \@P3 LDS.U.128 load1I0, [addr_zero];\n", + j10c8 => "--:-:-:-:1 \@P3 LDS.U.128 load1I4, [addr_zero];\n", + + j10c57 => "10:-:-:-:1 \@P2 IADD track1I0.CC, track1I0, 2x<32>;\n", + j10c63 => "--:-:-:-:1 \@P2 IADD.X track1I1, track1I1, RZ;\n", + + + j10c34 => "--:-:-:-:1 \@P0 $convert storeX7, loadE7.H1;\n", + j10c38 => "--:-:-:-:1 \@P0 $convert storeX6, loadE7.H0;\n", + j10c42 => "--:-:-:-:1 \@P0 $convert storeX5, loadE6.H1;\n", + j10c46 => "--:-:-:-:1 \@P0 $convert storeX4, loadE6.H0;\n", + j10c50 => "--:-:-:-:1 \@P0 $convert storeX3, loadE5.H1;\n", + j10c54 => "--:-:-:-:1 \@P0 $convert storeX2, loadE5.H0;\n", + j10c58 => "--:-:-:-:1 \@P0 $convert storeX1, loadE4.H1;\n", + j10c62 => "--:-:-:-:1 \@P0 $convert storeX0, loadE4.H0;\n", + + j11c34 => "08:-:-:-:1 \@!P0 $convert storeX7, loadE3.H1;\n", + j11c38 => "--:-:-:-:1 \@!P0 $convert storeX6, loadE3.H0;\n", + j11c42 => "--:-:-:-:1 \@!P0 $convert storeX5, loadE2.H1;\n", + j11c46 => "--:-:5:-:1 \@!P0 $convert storeX4, loadE2.H0;\n", + j11c50 => "--:-:-:-:1 \@!P0 $convert storeX3, loadE1.H1;\n", + j11c54 => "--:-:-:-:1 \@!P0 $convert storeX2, loadE1.H0;\n", + j11c58 => "--:-:-:-:1 \@!P0 $convert storeX1, loadE0.H1;\n", + j11c62 => "--:-:4:-:1 \@!P0 $convert storeX0, loadE0.H0;\n", + + j12c8 => "10:-:-:-:1 STS [writeEs + 4x<7*64>], storeX7;\n", + j12c10 => "--:-:-:-:1 STS [writeEs + 4x<6*64>], storeX6;\n", + j12c12 => "--:-:-:-:1 STS [writeEs + 4x<5*64>], storeX5;\n", + j12c14 => "--:-:-:-:1 STS [writeEs + 4x<4*64>], storeX4;\n", + j12c16 => "08:-:-:-:1 STS [writeEs + 4x<3*64>], storeX3;\n", + j12c18 => "--:-:-:-:1 STS [writeEs + 4x<2*64>], storeX2;\n", + j12c20 => "--:-:-:-:1 STS [writeEs + 4x<1*64>], storeX1;\n", + j12c22 => "--:4:-:-:1 STS [writeEs + 4x<0*64>], storeX0;\n", + + j12c24 => "--:-:-:-:1 ISETP.LT.AND P2, PT, k, param_K, P1;\n", + + j13c8 => "08:-:-:-:1 \@P2 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];\n", + j13c10 => "--:5:4:-:1 \@P2 LDG.E.CI.128 loadE4, [trackE + 2x<16>];\n", + + j15c57 => 
"10:-:-:-:1 \@P2 IADD trackE0.CC, trackE0, 2x<32>;\n", + j15c62 => "--:-:-:-:1 \@P2 IADD.X trackE1, trackE1, RZ;\n", + + # p0 = N >= 16 and not (N == 32 and (p or q)) + j14c8 => "--:-:-:-:1 ISETP.EQ.AND P0, PT, loopN, 32, PT;\n", + j14c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 16, PT;\n", + j14c22 => "--:-:-:-:1 PSETP.OR.AND P0, PT, P5, P6, P0;\n", + j14c35 => "--:-:-:-:1 PSETP.AND.AND P0, PT, !P0, P1, PT;\n", + + j14c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "20:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j15c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_16N;\n" . + "--:-:-:-:0 \@P5 IADD q, q, param_grid_Q;\n" . + "01:-:-:Y:5 \@P5 BRA.U NEXT_PQ;\n" . + "--:-:-:-:1 \@P6 MOV q, qq;\n" . + "--:-:-:-:0 \@P6 IADD p, p, param_grid_P;\n" . + "--:-:-:Y:5 \@P6 BRA.U NEXT_PQ;\n" . + "--:-:-:Y:5 BRA.U FINISH;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? '@P0' : ' '; + my $shift = $rsOffset < 8 ? 0 : 1; + my $barrier = $j == 14 ? 
'6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + ++] + +FIRST_LOAD: + +--:-:-:-:8 PSETP.AND.AND P0, PT, PT, PT, !PT; + +--:-:-:-:1 @P2 LDG.E.CI.128 load0I0, [track0I + 2x< 0>]; +--:-:1:-:1 @P2 LDG.E.CI.128 load0I4, [track0I + 2x<16>]; +--:-:-:-:1 @!P2 LDS.U.128 load0I0, [addr_zero]; +--:-:4:-:1 @!P2 LDS.U.128 load0I4, [addr_zero]; + +// p1 = N == 32 and (p or q) +--:-:-:-:0 ISETP.EQ.AND P1, PT, loopN, 32, PT; + +--:-:-:-:1 @P3 LDG.E.CI.128 load1I0, [track1I + 2x< 0>]; +--:-:2:-:1 @P3 LDG.E.CI.128 load1I4, [track1I + 2x<16>]; +--:-:-:-:1 @!P3 LDS.U.128 load1I0, [addr_zero]; +--:-:5:-:1 @!P3 LDS.U.128 load1I4, [addr_zero]; + +--:-:-:-:1 @P4 LDG.E.CI.128 loadE0, [trackE + 2x< 0>]; +--:-:3:-:1 @P4 LDG.E.CI.128 loadE4, [trackE + 2x<16>]; +--:-:-:-:1 @!P4 LDS.U.128 loadE0, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 loadE4, [addr_zero]; + + +09:-:-:-:1 [+ convert() +] storeX7, load0I3.H1; +--:-:-:-:1 [+ convert() +] storeX6, load0I3.H0; +--:-:-:-:1 [+ convert() +] storeX5, load0I2.H1; +--:-:1:-:1 [+ convert() +] storeX4, 
load0I2.H0; +--:-:-:-:1 [+ convert() +] storeX3, load0I1.H1; +--:-:-:-:1 [+ convert() +] storeX2, load0I1.H0; +--:-:-:-:1 [+ convert() +] storeX1, load0I0.H1; +--:-:4:-:1 [+ convert() +] storeX0, load0I0.H0; + +--:-:-:-:1 PSETP.OR.AND P1, PT, P5, P6, P1; +--:-:-:-:0 @P2 IADD track0I0.CC, track0I0, 2x<32>; + +01:-:-:-:1 STS [writeIs + 4x<7*128 + 0>], storeX7; +--:-:-:-:1 STS [writeIs + 4x<6*128 + 0>], storeX6; +--:-:-:-:1 STS [writeIs + 4x<5*128 + 0>], storeX5; +--:-:-:-:1 STS [writeIs + 4x<4*128 + 0>], storeX4; +08:-:-:-:1 STS [writeIs + 4x<3*128 + 0>], storeX3; +--:-:-:-:1 STS [writeIs + 4x<2*128 + 0>], storeX2; +--:-:-:-:1 STS [writeIs + 4x<1*128 + 0>], storeX1; +--:1:-:-:2 STS [writeIs + 4x<0*128 + 0>], storeX0; + +--:-:-:-:0 @P2 IADD.X track0I1, track0I1, RZ; + +13:-:-:-:1 [+ convert() +] storeX7, load1I3.H1; +--:-:-:-:1 [+ convert() +] storeX6, load1I3.H0; +--:-:-:-:1 [+ convert() +] storeX5, load1I2.H1; +--:-:2:-:1 [+ convert() +] storeX4, load1I2.H0; +--:-:-:-:1 [+ convert() +] storeX3, load1I1.H1; +--:-:-:-:1 [+ convert() +] storeX2, load1I1.H0; +--:-:-:-:1 [+ convert() +] storeX1, load1I0.H1; +--:-:5:-:1 [+ convert() +] storeX0, load1I0.H0; + +--:-:-:-:1 PSETP.AND.AND P5, PT, P5, P1, PT; +--:-:-:-:0 @P3 IADD track1I0.CC, track1I0, 2x<32>; + +02:-:-:-:1 STS [writeIs + 4x<7*128 + 64>], storeX7; +--:-:-:-:1 STS [writeIs + 4x<6*128 + 64>], storeX6; +--:-:-:-:1 STS [writeIs + 4x<5*128 + 64>], storeX5; +--:-:-:-:1 STS [writeIs + 4x<4*128 + 64>], storeX4; +10:-:-:-:1 STS [writeIs + 4x<3*128 + 64>], storeX3; +--:-:-:-:1 STS [writeIs + 4x<2*128 + 64>], storeX2; +--:-:-:-:1 STS [writeIs + 4x<1*128 + 64>], storeX1; +--:1:-:-:1 STS [writeIs + 4x<0*128 + 64>], storeX0; + +--:-:-:-:1 PSETP.AND.AND P6, PT, P6, P1, PT; +--:-:-:-:0 @P3 IADD.X track1I1, track1I1, RZ; + +25:-:-:-:1 [+ convert() +] storeX7, loadE3.H1; +--:-:-:-:1 [+ convert() +] storeX6, loadE3.H0; +--:-:-:-:1 [+ convert() +] storeX5, loadE2.H1; +--:-:3:-:1 [+ convert() +] storeX4, loadE2.H0; +--:-:-:-:1 [+ 
convert() +] storeX3, loadE1.H1; +--:-:-:-:1 [+ convert() +] storeX2, loadE1.H0; +--:-:-:-:1 [+ convert() +] storeX1, loadE0.H1; +--:-:6:-:1 [+ convert() +] storeX0, loadE0.H0; + +--:-:-:-:0 @P4 IADD trackE0.CC, trackE0, 2x<32>; + +04:-:-:-:1 STS [writeEs + 4x<7*64>], storeX7; +--:-:-:-:1 STS [writeEs + 4x<6*64>], storeX6; +--:-:-:-:1 STS [writeEs + 4x<5*64>], storeX5; +--:-:-:-:1 STS [writeEs + 4x<4*64>], storeX4; +20:-:-:-:1 STS [writeEs + 4x<3*64>], storeX3; +--:-:-:-:1 STS [writeEs + 4x<2*64>], storeX2; +--:-:-:-:1 STS [writeEs + 4x<1*64>], storeX1; +--:1:-:-:1 STS [writeEs + 4x<0*64>], storeX0; + +--:-:-:-:1 @P4 IADD.X trackE1, trackE1, RZ; + +--:-:-:-:1 IADD readEs, readEs, -swapBuf; +--:-:-:-:0 IADD readIs, readIs, -swapBuf; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:0 @P5 IADD q, q, param_grid_Q; +--:-:-:Y:5 @P5 BRA.U NEXT_PQ; +--:-:-:-:0 @P6 IADD p, p, param_grid_P; +--:-:-:Y:5 @P6 BRA.U NEXT_PQ; + +--:-:-:-:2 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:Y:5 BRA.U INIT_LOOP; + + +FINISH: + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID2; +--:-:2:-:1 S2R blkI, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.X; +--:-:-:-:5 BRA.U END_CTAID2; +CTAID2: +--:-:2:-:1 S2R blkI, SR_CTAID.X; +--:-:3:-:1 S2R blkE, SR_CTAID.Y; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.Z; +END_CTAID2: + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readEs, readEs, -4x; +--:-:-:-:1 @P0 IADD readIs, readIs, -swapBuf; +--:-:-:-:1 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 64 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 4; + + +// readCs = ((tid & 96) << 3) | (tid 
& 31) +01:-:-:-:1 LOP.AND tid31, tid, 31; +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + + +// kk = blkE*64 + tid31; +04:-:-:-:1 ISCADD kk, blkE, tid31, 6; +--:-:-:-:1 IADD kk, kk, param_offset_K; + +// crst = blkI*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 crst00, tid96, 1; +02:-:-:-:1 ISCADD crst00, blkI, crst00, 7; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; +--:-:-:-:1 SHL K4, K, 4; +--:-:-:-:1 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +--:-:-:-:1 VMAD.U16.U16 tf, crst00, K, kk; +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSTK, param_CRSTK; +08:-:-:-:1 XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ; + }; + } + return ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 2; +--:-:-:-:1 LEA.HI.X track00F1, tf, param_F[1], RZ, 2; + +--:-:-:-:1 MOV alpha, param_alpha; + +// kk < K +--:-:-:-:1 ISETP.LT.AND P5, PT, kk, param_K, PT; +--:-:-:-:1 IADD kk, kk, 32; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, PT; + + + +--:-:-:-:6 IADD track04F0.CC, track00F0, K4; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:6 IADD track08F0.CC, track04F0, K4; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:6 IADD track12F0.CC, track08F0, K4; +--:-:-:-:1 IADD.X track12F1, track08F1, RZ; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD track00F0.CC, track00F0, K60;\n" . + "--:-:-:-:1 IADD crst00, crst00, 60;\n" . + "--:-:-:-:1 IADD.X track00F1, track00F1, RZ;\n" . + "--:-:-:-:5 IADD track04F0.CC, track04F0, K60;\n" . + "--:-:-:-:1 IADD crst04, crst04, 60;\n" . + "--:-:-:-:1 IADD.X track04F1, track04F1, RZ;\n" . + "--:-:-:-:5 IADD track08F0.CC, track08F0, K60;\n" . + "--:-:-:-:1 IADD crst08, crst08, 60;\n" . + "--:-:-:-:1 IADD.X track08F1, track08F1, RZ;\n" . + "--:-:-:-:5 IADD track12F0.CC, track12F0, K60;\n" . 
+ "--:-:-:-:1 IADD crst12, crst12, 60;\n" . + "--:-:-:-:1 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL f0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL f1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL f2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL f3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL f4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL f5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL f6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL f7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +--:-:-:-:0 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], f0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], f4; + +--:-:1:-:1 LDS f0, [readCs + 4x<0*64 + 00>]; +--:-:2:-:1 LDS f2, [readCs + 4x<1*64 + 00>]; +--:-:3:-:1 LDS f4, [readCs + 4x<2*64 + 00>]; +--:-:4:-:1 LDS f6, [readCs + 4x<3*64 + 00>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:-:-:-:1 @P0 STG.E.CG [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 STG.E.CG [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 STG.E.CG [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 STG.E.CG [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } + else + { + return q{ +01:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2; +--:-:-:-:1 
PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } ++] + +--:-:1:-:1 LDS f1, [readCs + 4x<0*64 + 32>]; +--:-:2:-:1 LDS f3, [readCs + 4x<1*64 + 32>]; +--:-:3:-:1 LDS f5, [readCs + 4x<2*64 + 32>]; +--:-:4:-:1 LDS f7, [readCs + 4x<3*64 + 32>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:1:-:-:1 @P0 STG.E.CG [track00F + 4x<32>], f1; +02:2:-:-:1 @P1 STG.E.CG [track04F + 4x<32>], f3; +04:3:-:-:1 @P2 STG.E.CG [track08F + 4x<32>], f5; +08:4:-:-:1 @P3 STG.E.CG [track12F + 4x<32>], f7; + }; + } + else + { + return q{ +01:1:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1; +02:2:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3; +04:3:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5; +08:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7; + }; + } ++] + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X128_N128.sass b/Kernel/Convolution/Pascal/hconv_xprop_X128_N128.sass new file mode 100644 index 0000000..71bae4b --- /dev/null +++ b/Kernel/Convolution/Pascal/hconv_xprop_X128_N128.sass @@ -0,0 +1,261 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 128; + our $shareF = 128; + our $stepI = 32; + our $stepF = 64; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + szShareF : (128*8) + szShareI : (128*8) + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<128*8*2 + 128*8*2 + 4> + addr_m : 4x<128*8*2 + 128*8*2 + 4> + addr_p : 4x<128*8*2 + 128*8*2 + 5> + addr_q : 4x<128*8*2 + 128*8*2 + 6> + addr_k : 4x<128*8*2 + 128*8*2 + 7> + addr_szLut : 4x<128*8*2 + 128*8*2 + 8> + addr_lut : 4x<128*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne + 72-111 ~ tid1, tid128, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + + 100-103 : loadI<0-1>, loadF<0-1> + 
104-107 : storeI<0-3> + 104-107 : storeF<0-3> + + 108-111 ~ offsetF, offsetI, offsetFc, offsetIc + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 114-122 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset + 123-127 ~ readFs, readIs, tid, idx_N + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-122 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidX = (tid & 31) << 2 +// tidY = tid >> 5 +--:-:-:-:1 LOP.AND tidX, tid, 31; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 5; + +// trackF += blkF*128 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 7; + +// trackI += blkI*128 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 7; + +// writeS = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 SHL writeS, writeS, 2; + +// readFs = ((tid & 112) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 112; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7) +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 SHR.U32 tid128, tid128, 3; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid128; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.64 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.64 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.64 loadI, [addr_zero]; + +12:-:-:-:1 [+ convert() +] storeF3, loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; 
+--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:2:-:2 [+ convert() +] storeF0, loadF0.H0; + +02:1:-:-:2 STS.128 [writeS], storeF; + +25:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:2:-:2 [+ convert() +] storeI0, loadI0.H0; + +02:1:-:-:1 STS.128 [writeS + 4x], storeI; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.64 loadI, [trackI]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c33 => "02:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c41 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c45 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c60 => "02:2:-:-:1 \@P0 STS.128 [writeS], storeF;\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "20:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => 
"--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c40 => "02:-:2:-:1 \@P1 LDG.E.CI.64 loadF, [trackF];\n", + + + j5c45 => "04:-:-:-:1 \@P0 $convert storeI3, loadI1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert storeI2, loadI1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert storeI1, loadI0.H1;\n", + j5c57 => "--:-:3:-:1 \@P0 $convert storeI0, loadI0.H0;\n", + + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x], storeI;\n", + + j6c54 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j6c59 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j6c61 => "04:-:3:-:1 \@P1 LDG.E.64 loadI, [trackI];\n", + + j6c62 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; + + + +// tidOX = (tid & 7) << 3 + (tid & 128) >> 1 +// tidOY = (tid & 127) >> 3 +--:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 3; +--:-:-:-:1 LOP.AND tidOX2, tid, 128; +--:-:-:-:1 SHR.U32 tidOX2, tidOX2, 1; +--:-:-:-:1 LOP.OR tidOX, tidOX, tidOX2; +--:-:-:-:1 LOP.AND tidOY, tid, 127; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 LOP.AND readIs, readIs, 0x1ff; +--:-:-:-:1 LOP.AND readFs, readFs, 0x0ff; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +--:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*128 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 7; + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X128_N64.sass 
b/Kernel/Convolution/Pascal/hconv_xprop_X128_N64.sass new file mode 100644 index 0000000..ce64717 --- /dev/null +++ b/Kernel/Convolution/Pascal/hconv_xprop_X128_N64.sass @@ -0,0 +1,284 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 64; + our $shareF = 128; + our $stepI = 32; + our $stepF = 64; + our $remapF = 1; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + szShareF : (128*8) + szShareI : (64*8) + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 128*8*2 + 4> + addr_m : 4x<64*8*2 + 128*8*2 + 4> + addr_p : 4x<64*8*2 + 128*8*2 + 5> + addr_q : 4x<64*8*2 + 128*8*2 + 6> + addr_k : 4x<64*8*2 + 128*8*2 + 7> + addr_szLut : 4x<64*8*2 + 128*8*2 + 8> + addr_lut : 4x<64*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-109 ~ tid1, tid15, tidFX, tidIX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, 
div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-109 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + + 100-103 : loadF<0-3> + 100-103 : storeF<0-3> + 104-107 : storeF<4-7> + + 108-109 : loadI<0-1> + 104-107 : storeI<0-3> + + 104-107 ~ offsetF + + 110-111 : sliceI, sliceF + 110-111 : sliceIF<0-1> + + 112-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc + 125-127 ~ readFs, readIs, swapBuf + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-124 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidFX = (tid & 15) << 3 +// tidIX = (tid & 15) << 2 +// tidY = tid >> 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidFX, tid15, 3; +--:-:-:-:1 SHL tidIX, tid15, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 4; + +// trackF += blkF*128 + tidFX + offset_K +--:-:-:-:1 ISCADD offsetFk, idx_K, tidFX, 7; + +// trackI += blkI*64 + tidIX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidIX, 6; + +// Remap the FX dim to avoid bank conflicts when storing to shared + +// writeFs = (128*tidY + tidIX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidIX, 7; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (64*tidY + tidIX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidIX, 6; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = ((tid & -16) >> 3) | (tid & 1) 
+--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = (tid >> 1) & 7 +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.128 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.64 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.64 loadI, [addr_zero]; + +12:-:-:-:1 [+ convert() +] storeF7, loadF3.H1; +--:-:-:-:1 [+ convert() +] storeF6, loadF3.H0; +--:-:-:-:1 [+ convert() +] storeF5, loadF2.H1; +--:-:1:-:1 [+ convert() +] storeF4, loadF2.H0; +--:-:-:-:1 [+ convert() +] storeF3, loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; +--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:2:-:1 [+ convert() +] storeF0, loadF0.H0; + +01:-:-:-:1 STS.128 [writeFs + 4x<64>], storeF4; +02:1:-:-:2 STS.128 [writeFs + 4x<00>], storeF0; + +25:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:3:-:2 [+ convert() +] storeI0, loadI0.H0; + +04:1:-:-:1 STS.128 [writeIs], storeI0; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.64 loadI, [trackI]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 
\@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c29 => "02:-:-:-:1 \@P0 $convert storeF7, loadF3.H1;\n", + j1c33 => "--:-:-:-:1 \@P0 $convert storeF6, loadF3.H0;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF5, loadF2.H1;\n", + j1c41 => "--:-:5:-:1 \@P0 $convert storeF4, loadF2.H0;\n", + j1c45 => "--:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c49 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c53 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c57 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c59 => "10:5:-:-:1 \@P0 STS.128 [writeFs + 4x<64>], storeF4;\n", + j2c8 => "02:2:-:-:1 \@P0 STS.128 [writeFs + 4x<00>], storeF0;\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "30:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c40 => "02:-:2:-:1 \@P1 LDG.E.CI.128 loadF, [trackF];\n", + + j5c45 => "04:-:-:-:1 \@P0 $convert storeI3, loadI1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert storeI2, loadI1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert storeI1, loadI0.H1;\n", + j5c57 => "--:-:3:-:1 \@P0 $convert storeI0, loadI0.H0;\n", + + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeIs], storeI0;\n", + + j6c55 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j6c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j6c62 => "04:-:3:-:1 \@P1 LDG.E.64 loadI, [trackI];\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 3 +// tidOY = tid >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 3; +--:-:-:-:1 SHR.U32 tidOY, tid, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 64 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 6; + +// Mul by 4 here expands k stride back out +// Mul by 2 again to undo the bank conflict avoiding stride +// k = blkF*128 + tidOY * 8 +--:-:-:-:1 SHL tidOY, tidOY, 3; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 7; + +[+ output_setup(63, 0, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X32_N128.sass b/Kernel/Convolution/Pascal/hconv_xprop_X32_N128.sass new file mode 100644 index 0000000..e85f7d4 --- /dev/null +++ b/Kernel/Convolution/Pascal/hconv_xprop_X32_N128.sass @@ -0,0 +1,323 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 128; + our $shareF = 32; + our $stepI = 32; + our $stepF = 16; + our $remapI = 1; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + + addr_zero : 4x<32*8*2 + 128*8*2 + 0> + szShareF : (32*8) + szShareI : (128*8) + + addr_zero : 4x<32*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<32*8*2 + 128*8*2 + 4> + addr_m : 4x<32*8*2 + 128*8*2 + 4> + addr_p : 4x<32*8*2 + 128*8*2 + 5> + addr_q : 4x<32*8*2 + 128*8*2 + 6> + addr_k : 4x<32*8*2 + 128*8*2 + 7> + addr_szLut : 4x<32*8*2 + 128*8*2 + 8> + addr_lut : 4x<32*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-69 : m, p, q + 64-69 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne + 70-113 ~ tid1, tidIX, tidFX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 70-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + + 100-103 : load0I<0-3> + 
100-103 : store0I<0-3> + 104-107 : store0I<4-7> + + 108-111 : load1I<0-3> + 108-111 : store1I<0-3> + 104-107 : store1I<4-7> + + 112-113 : loadF<0-1> + 104-107 : storeF<0-3> + + 114-115 : sliceI, sliceF + 114-115 : sliceIF<0-1> + + 116-140 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetF, offsetIc, offsetFc + 141-155 ~ readFs, readIs, swapBuf, tid, idx_N, tid7, tid1_7, tid32, tid32_1 + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-140 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] + +[+ get_mpqk() +] + +// tidIX = (tid & 7) << 3 +// tidFX = (tid & 7) << 2 + +// tidY = tid >> 3 +--:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHL tidIX, tid7, 3; +--:-:-:-:1 SHL tidFX, tid7, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 3; + +// trackF += blkF*32 + tidFX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidFX, 5; + +// trackI += blkI*128 + tidIX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidIX, 7; + +// writeFs = (32*tidY + tidFX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidFX, 5; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// Remap the IX dim to avoid bank conflicts when storing to shared + +// writeIs = (128*tidY + tidFX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidFX, 7; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = (((tid & 16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 32) >> 1) | ((tid >> 1) & 7) << 4 +--:-:-:-:1 LOP.AND tid32, tid, 32; +--:-:-:-:1 SHR.U32 
tid32_1, tid32, 1; +--:-:-:-:1 BFE.U32 tid1_7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, tid1_7, tid32_1; +--:-:-:-:1 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.64 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 load0I, [trackI + 2x<00>]; +--:-:4:-:1 @P1 LDG.E.128 load1I, [trackI + 2x<64>]; +--:-:-:-:1 @!P1 LDS.U.128 load0I, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 load1I, [addr_zero]; + +12:-:-:-:1 [+ convert() +] storeF3, loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; +--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:2:-:2 [+ convert() +] storeF0, loadF0.H0; + +02:1:-:-:2 STS.128 [writeFs], storeF0; + +25:-:-:-:1 [+ convert() +] store0I7, load0I3.H1; +--:-:-:-:1 [+ convert() +] store0I6, load0I3.H0; +--:-:-:-:1 [+ convert() +] store0I5, load0I2.H1; +--:-:2:-:1 [+ convert() +] store0I4, load0I2.H0; +--:-:-:-:1 [+ convert() +] store0I3, load0I1.H1; +--:-:-:-:1 [+ convert() +] store0I2, load0I1.H0; +--:-:-:-:1 [+ convert() +] store0I1, load0I0.H1; +--:-:3:-:1 [+ convert() +] store0I0, load0I0.H0; + +02:-:-:-:1 STS.128 [writeIs + 4x<32>], store0I4; +04:1:-:-:2 STS.128 [writeIs + 4x<00>], store0I0; + +09:-:-:-:1 [+ convert() +] store1I7, load1I3.H1; +--:-:-:-:1 [+ convert() +] store1I6, load1I3.H0; +--:-:-:-:1 [+ convert() +] store1I5, load1I2.H1; +--:-:2:-:1 [+ convert() +] store1I4, load1I2.H0; +--:-:-:-:1 [+ convert() +] store1I3, load1I1.H1; +--:-:-:-:1 [+ convert() +] store1I2, load1I1.H0; +--:-:-:-:1 [+ convert() +] store1I1, load1I0.H1; +--:-:3:-:1 [+ convert() +] store1I0, load1I0.H0; + +02:-:-:-:1 STS.128 [writeIs + 4x<96>], store1I4; +04:1:-:-:1 STS.128 [writeIs + 4x<64>], store1I0; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.128 load0I, [trackI + 2x<00>]; +--:5:4:-:1 @P1 LDG.E.128 load1I, [trackI + 2x<64>]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, 
posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c33 => "02:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c41 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c45 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c60 => "02:-:-:-:1 \@P0 STS.128 [writeFs], storeF0;\n", + + j1c62 => "--:-:2:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "02:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.64 loadF0, [trackF];\n", + + + j3c29 => "04:-:-:-:1 \@P0 $convert store0I7, load0I3.H1;\n", + j3c33 => "--:-:-:-:1 \@P0 $convert store0I6, load0I3.H0;\n", + j3c37 => "--:-:-:-:1 \@P0 $convert store0I5, load0I2.H1;\n", + j3c41 => "--:-:6:-:1 \@P0 $convert store0I4, load0I2.H0;\n", + j3c45 => "--:-:-:-:1 \@P0 $convert store0I3, load0I1.H1;\n", + j3c49 => "--:-:-:-:1 \@P0 $convert store0I2, load0I1.H0;\n", + j3c53 => "--:-:-:-:1 \@P0 $convert store0I1, 
load0I0.H1;\n", + j3c57 => "--:-:3:-:1 \@P0 $convert store0I0, load0I0.H0;\n", + + j3c59 => "20:-:-:-:1 \@P0 STS.128 [writeIs + 4x<32>], store0I4;\n", + j4c8 => "04:3:-:-:1 \@P0 STS.128 [writeIs + 4x<00>], store0I0;\n", + + j4c50 => "10:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j4c55 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j4c61 => "04:-:3:-:1 \@P1 LDG.E.128 load0I0, [trackI + 2x<00>];\n", + + + j5c29 => "08:-:-:-:1 \@P0 $convert store1I7, load1I3.H1;\n", + j5c33 => "--:-:-:-:1 \@P0 $convert store1I6, load1I3.H0;\n", + j5c37 => "--:-:-:-:1 \@P0 $convert store1I5, load1I2.H1;\n", + j5c41 => "--:-:6:-:1 \@P0 $convert store1I4, load1I2.H0;\n", + j5c45 => "--:-:-:-:1 \@P0 $convert store1I3, load1I1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert store1I2, load1I1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert store1I1, load1I0.H1;\n", + j5c57 => "--:-:4:-:1 \@P0 $convert store1I0, load1I0.H0;\n", + + j5c59 => "20:-:-:-:1 \@P0 STS.128 [writeIs + 4x<96>], store1I4;\n", + j6c8 => "08:4:-:-:1 \@P0 STS.128 [writeIs + 4x<64>], store1I0;\n", + + j6c61 => "08:5:4:-:1 \@P1 LDG.E.128 load1I0, [trackI + 2x<64>];\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; + + + +// tidOX = (tid & 7) << 3 + (tid & 32) << 1 +// tidOY = (tid & 31) >> 3 +--:-:-:-:1 SHL tid32, tid32, 1; +--:-:-:-:1 ISCADD tidOX, tid7, tid32, 3; +--:-:-:-:1 LOP.AND tidOY, tid, 31; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; + +// readIs = ((tid & 32) >> 1) | (((tid >> 1) & 7) << 1) << 4 +--:-:-:-:1 ISCADD readIs, tid1_7, tid32_1, 1; +--:-:-:-:1 SHL readIs, readIs, 4; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +--:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*32 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +--:-:-:-:1 ISCADD k, idx_K, tidOY, 5; + + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X64_N128.sass b/Kernel/Convolution/Pascal/hconv_xprop_X64_N128.sass new file mode 100644 index 0000000..38f8183 --- /dev/null +++ b/Kernel/Convolution/Pascal/hconv_xprop_X64_N128.sass @@ -0,0 +1,293 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 128; + our $shareF = 64; + our $stepI = 64; + our $stepF = 32; + our $remapI = 1; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} +-] + + + + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + szShareF : (64*8) + szShareI : (128*8) + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 128*8*2 + 4> + addr_m : 4x<64*8*2 + 128*8*2 + 4> + addr_p : 4x<64*8*2 + 128*8*2 + 5> + addr_q : 4x<64*8*2 + 128*8*2 + 6> + addr_k : 4x<64*8*2 + 128*8*2 + 7> + addr_szLut : 4x<64*8*2 + 128*8*2 + 8> + addr_lut : 4x<64*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-111 ~ tid1, tid15, tid64, tidIX, tidFX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + + 100-103 : loadI<0-3> + 100-103 : storeI<0-3> + 104-107 : storeI<4-7> + + 108-109 : loadF<0-1> + 104-107 : storeF<0-3> + + 110-111 : sliceI, sliceF + 110-111 : sliceIF<0-1> + + 108-109 ~ offsetF + + 112-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, 
offsetIc, offsetFc + 125-127 ~ readFs, readIs, swapBuf + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-124 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidIX = (tid & 15) << 3 +// tidFX = (tid & 15) << 2 +// tidY = tid >> 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidIX, tid15, 3; +--:-:-:-:1 SHL tidFX, tid15, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 4; + +// trackF += blkF*64 + tidFX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidFX, 6; + +// trackI += blkI*128 + tidIX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidIX, 7; + +// writeFs = (64*tidY + tidFX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidFX, 6; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// Remap the IX dim to avoid bank conflicts when storing to shared + +// writeIs = (128*tidY + tidFX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidFX, 7; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = ((tid & 48) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 48; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 64) >> 3) | ((tid >> 1) & 7) +--:-:-:-:1 LOP.AND tid64, tid, 64; +--:-:-:-:1 SHR.U32 tid64, tid64, 3; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid64; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.64 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero]; + +12:-:-:-:1 [+ convert() +] storeF3, 
loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; +--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:2:-:2 [+ convert() +] storeF0, loadF0.H0; + +02:1:-:-:2 STS.128 [writeFs], storeF0; + +25:-:-:-:1 [+ convert() +] storeI7, loadI3.H1; +--:-:-:-:1 [+ convert() +] storeI6, loadI3.H0; +--:-:-:-:1 [+ convert() +] storeI5, loadI2.H1; +--:-:2:-:1 [+ convert() +] storeI4, loadI2.H0; +--:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:3:-:1 [+ convert() +] storeI0, loadI0.H0; + +02:-:-:-:1 STS.128 [writeIs + 4x<64>], storeI4; +04:1:-:-:1 STS.128 [writeIs + 4x<00>], storeI0; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.64 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.128 loadI, [trackI]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c33 => "02:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c41 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c45 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c60 => "02:2:-:-:1 \@P0 STS.128 [writeFs], storeF0;\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, 
param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "22:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.64 loadF, [trackF];\n", + + + j5c29 => "04:-:-:-:1 \@P0 $convert storeI7, loadI3.H1;\n", + j5c33 => "--:-:-:-:1 \@P0 $convert storeI6, loadI3.H0;\n", + j5c37 => "--:-:-:-:1 \@P0 $convert storeI5, loadI2.H1;\n", + j5c41 => "--:-:6:-:1 \@P0 $convert storeI4, loadI2.H0;\n", + j5c45 => "--:-:-:-:1 \@P0 $convert storeI3, loadI1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert storeI2, loadI1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert storeI1, loadI0.H1;\n", + j5c57 => "--:-:3:-:1 \@P0 $convert storeI0, loadI0.H0;\n", + + j5c59 => "20:-:-:-:1 \@P0 STS.128 [writeIs + 4x<64>], storeI4;\n", + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeIs + 4x<00>], storeI0;\n", + + j6c55 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j6c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j6c62 => "04:-:3:-:1 \@P1 LDG.E.128 loadI, [trackI];\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 3 + (tid & 64) +// tidOY = (tid & 63) >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 3; +--:-:-:-:1 LOP.AND tidOX2, tid, 64; +--:-:-:-:1 LOP.OR tidOX, tidOX, tidOX2; +--:-:-:-:1 LOP.AND tidOY, tid, 63; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Expand back out to undo our bank conflict avoiding stride +--:-:-:-:1 SHL readIs, readIs, 1; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*64 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 6; + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X64_N64.sass b/Kernel/Convolution/Pascal/hconv_xprop_X64_N64.sass new file mode 100644 index 0000000..16b92c5 --- /dev/null +++ b/Kernel/Convolution/Pascal/hconv_xprop_X64_N64.sass @@ -0,0 +1,290 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $int16; + our $prefix = 'h'; + our $shareI = 64; + our $shareF = 64; + our $stepI = 32; + our $stepF = 32; + our $remapF = 1; + our $remapI = 1; + our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + sub convert {return $convert;} + +-] + + + + + addr_zero : 4x<64*8*2 + 64*8*2 + 0> + szShareF : (64*8) + szShareI : (64*8) + + addr_zero : 4x<64*8*2 + 64*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 64*8*2 + 4> + addr_m : 4x<64*8*2 + 64*8*2 + 4> + addr_p : 4x<64*8*2 + 64*8*2 + 5> + addr_q : 4x<64*8*2 + 64*8*2 + 6> + addr_k : 4x<64*8*2 + 64*8*2 + 7> + addr_szLut : 4x<64*8*2 + 64*8*2 + 8> + addr_lut : 4x<64*8*2 + 64*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-111 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-103 : 
loadI<0-3> + 100-103 : storeI<0-3> + 104-107 : storeI<4-7> + + 108-111 : loadF<0-3> + 108-111 : storeF<0-3> + 104-107 : storeF<4-7> + + 104-107 ~ offsetF + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 114-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc + 126-127 ~ readFs, readIs + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-125 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] + +[+ get_mpqk() +] + +// tidX = (tid & 7) << 3 +// tidY = tid >> 3 +--:-:-:-:1 LOP.AND tidX, tid, 7; +--:-:-:-:1 SHL tidX, tidX, 3; +--:-:-:-:1 SHR.U32 tidY, tid, 3; + +// trackF += blkF*64 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 6; + +// trackI += blkI*64 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 6; + +// Remap the X dim to avoid bank conflicts when storing to shared +// We can unmap this in the output +--:-:-:-:1 SHR.U32 tidX, tidX, 1; + +// writeS = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 6; +--:-:-:-:1 SHL writeS, writeS, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:0 SHL readFs, readFs, 4; + +// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:1:-:1 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:5:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; + +--:-:2:-:1 @P1 LDG.E.128 loadI0, [trackI]; +--:-:6:-:1 @!P1 LDS.U.128 loadI0, [addr_zero]; + 
+11:-:-:-:1 [+ convert() +] storeF7, loadF3.H1; +--:-:-:-:1 [+ convert() +] storeF6, loadF3.H0; +--:-:-:-:1 [+ convert() +] storeF5, loadF2.H1; +--:-:1:-:1 [+ convert() +] storeF4, loadF2.H0; +--:-:-:-:1 [+ convert() +] storeF3, loadF1.H1; +--:-:-:-:1 [+ convert() +] storeF2, loadF1.H0; +--:-:-:-:1 [+ convert() +] storeF1, loadF0.H1; +--:-:5:-:1 [+ convert() +] storeF0, loadF0.H0; + +01:1:-:-:1 STS.128 [writeS + 4x<0*64 + 32>], storeF4; +10:-:-:-:1 STS.128 [writeS + 4x<0*64 + 0>], storeF0; + +23:-:-:-:1 [+ convert() +] storeI7, loadI3.H1; +--:-:-:-:1 [+ convert() +] storeI6, loadI3.H0; +--:-:-:-:1 [+ convert() +] storeI5, loadI2.H1; +--:-:1:-:1 [+ convert() +] storeI4, loadI2.H0; +--:-:-:-:1 [+ convert() +] storeI3, loadI1.H1; +--:-:-:-:1 [+ convert() +] storeI2, loadI1.H0; +--:-:-:-:1 [+ convert() +] storeI1, loadI0.H1; +--:-:5:-:1 [+ convert() +] storeI0, loadI0.H0; + +01:-:-:-:1 STS.128 [writeS + 4x<8*64 + 32>], storeI4; +10:1:-:-:1 STS.128 [writeS + 4x<8*64 + 0>], storeI0; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>]; +--:-:3:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x< 0>]; + +[- + our $convert; + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c20 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j1c25 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j1c31 => "--:-:-:-:1 \@P1 XMAD.PSL 
offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j1c32 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j1c18 => "02:-:-:-:1 \@P0 $convert storeF7, loadF3.H1;\n", + j1c22 => "--:-:-:-:1 \@P0 $convert storeF6, loadF3.H0;\n", + j1c26 => "--:-:-:-:1 \@P0 $convert storeF5, loadF2.H1;\n", + j1c30 => "--:-:5:-:1 \@P0 $convert storeF4, loadF2.H0;\n", + j1c33 => "--:-:-:-:1 \@P0 $convert storeF3, loadF1.H1;\n", + j1c37 => "--:-:-:-:1 \@P0 $convert storeF2, loadF1.H0;\n", + j1c41 => "--:-:-:-:1 \@P0 $convert storeF1, loadF0.H1;\n", + j1c45 => "--:-:2:-:1 \@P0 $convert storeF0, loadF0.H0;\n", + + j1c47 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<0*64 + 32>], storeF4;\n", + j1c62 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*64 + 0>], storeF0;\n", + + j2c19 => "30:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c24 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 1;\n", + j2c26 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c28 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 1;\n", + + j2c30 => "02:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF];\n", + + j5c29 => "04:-:-:-:1 \@P0 $convert storeI7, loadI3.H1;\n", + j5c33 => "--:-:-:-:1 \@P0 $convert storeI6, loadI3.H0;\n", + j5c37 => "--:-:-:-:1 \@P0 $convert storeI5, loadI2.H1;\n", + j5c41 => "--:-:5:-:1 \@P0 $convert storeI4, loadI2.H0;\n", + j5c45 => "--:-:-:-:1 \@P0 $convert storeI3, loadI1.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 $convert storeI2, loadI1.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 $convert storeI1, loadI0.H1;\n", + j5c57 => "--:-:3:-:1 \@P0 $convert storeI0, loadI0.H0;\n", + + j5c59 => "10:-:-:-:1 \@P0 STS.128 [writeS + 4x<8*64 + 32>], storeI4;\n", + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<8*64 + 0>], storeI0;\n", + + j6c50 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 1;\n", + j6c55 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 1;\n", + + j6c61 => "04:-:3:-:1 \@P1 LDG.E.128 loadI0, [trackI];\n", + + j6c62 => "--:-:-:-:5 \@P0 
BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 3 +// tidOY = tid >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 3; +--:-:-:-:1 SHR.U32 tidOY, tid, 3; + +--:-:-:-:1 LOP.AND readIs, readIs, 0x7ff; +--:-:-:-:1 LOP.AND readFs, readFs, 0x7ff; + +// Expand back out to undo our bank conflict avoiding stride +--:-:-:-:1 SHL readIs, readIs, 1; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 64 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 6; + +// Mul by 4 here expands k stride back out +// Mul by 2 again to undo the bank conflict avoiding stride +// k = blkF*64 + tidOY * 8 +--:-:-:-:1 SHL tidOY, tidOY, 3; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 6; + +[+ output_setup(63, 0, 6) +] + + + +[+ output() +] diff --git a/Kernel/Convolution/Pascal/persistent_rnn_bprop.sass b/Kernel/Convolution/Pascal/persistent_rnn_bprop.sass new file mode 100644 index 0000000..ddddb22 --- /dev/null +++ b/Kernel/Convolution/Pascal/persistent_rnn_bprop.sass @@ -0,0 +1,638 @@ +# Kernel: presistent_birnn + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(64*48)> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_d[0] : c[0x0][0x140] + param_d[1] : c[0x0][0x144] + param_dnext[0] : c[0x0][0x148] + param_dnext[1] : c[0x0][0x14c] + param_h[0] : c[0x0][0x150] + param_h[1] : c[0x0][0x154] + param_w[0] : c[0x0][0x158] + param_w[1] : c[0x0][0x15c] + param_lockAddr[0] : c[0x0][0x160] + param_lockAddr[1] : c[0x0][0x164] + param_ldd : c[0x0][0x168] + param_ldh : c[0x0][0x16c] + param_ldw : c[0x0][0x170] + param_bsz : c[0x0][0x174] + param_seqLength : c[0x0][0x178] + param_numBlks : c[0x0][0x17c] + param_rowSize : c[0x0][0x180] + param_reverse : c[0x0][0x184] + param_reluclip : c[0x0][0x188] + + + + + 0-215 : weight<000-215> + 216-227 : accum<00-11> + 228-231 : timeStep, warpTid, rowOffset, tid + + 232-235 : wAddr<0-1>, biasAddr<0-1> + 236-254 ~ bid, ldw, wRow, loadRow, tidLsbs, tidMsbs, warpIndex, storeWeights, loadWeights, outRow, rowSize + + 232-249 : loadBuffer<0-3>, delta0r<0-3>, delta1r<0-3>, delta2r<0-3>, dnextAddr<0-1> + 250-254 ~ loadDeltas, storeDeltas, loadIndex, dOffset, ldd + + 236-247 : peerR0V<0-3>, peerR1V<0-3>, peerR2V<0-3> + 244 : hOffset + 248-253 : h<0-3>, hAddr<0-1> + + 232-241 : output<0-3>, dAddr<0-1>, lockAddr<0-1>, expectVal, setVal + 241-245 ~ storeIndex, hRow, predSave, lockVal, reluclip + + + +//Get tid/block id +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R bid, SR_CTAID.X; + +//Store zeros at addr_zero +--:-:-:-:1 STS.128 [addr_zero], RZ; + + +--:-:-:-:1 MOV ldw, param_ldw; +--:-:-:-:1 MOV rowSize, param_rowSize; + +//timeStep = (param_reverse == 0) ? 
0 : param_seqLength - 1 +--:-:-:-:1 ISETP.EQ.AND P2, PT, RZ, param_reverse, PT; +--:-:-:-:1 SEL timeStep, RZ, param_seqLength, P2; +--:-:-:-:1 @!P2 IADD timeStep, timeStep, -1; + +//warpIndex = threadIdx.x >> 5 +01:-:-:-:1 SHR.U32 warpIndex, tid, 5; + +//warpTid = threadIdx.x & 0x1f +01:-:-:-:1 LOP.AND warpTid, tid, 0x1f; + +//rowOffset = ((blockIdx.x << 3) + warp_index) * 6 +02:-:-:-:1 SHL rowOffset, bid, 3; +--:-:-:-:1 IADD rowOffset, rowOffset, warpIndex; +--:-:-:-:1 XMAD rowOffset, rowOffset, 6, RZ; + +//if(warp_tid > 15) rowOffset += 3 +--:-:-:-:1 ISETP.GT.AND P1, PT, warpTid, 15, PT; +--:-:-:-:1 @P1 IADD rowOffset, rowOffset, 3; + +//warpTid = warpTid & 0x0f +--:-:-:-:1 LOP.AND warpTid, warpTid, 0x0f; +--:-:-:-:1 ISETP.LT.AND P0, PT, warpTid, 3, PT; +--:-:-:-:1 IADD outRow, rowOffset, warpTid; +--:-:-:-:1 ISETP.LT.AND P0, PT, outRow, param_rowSize, P0; + +//storeWeights = (((tid >> 2) * 48) + ((tid & 3) << 2)) << 2 +//wRow = ((tid >> 2) * ldw) + ((tid & 3) << 2) + (bid * 48) +--:-:-:-:1 LOP.AND tidLsbs, warpTid, 0x03; +--:-:-:-:1 SHR tidMsbs, tid, 2; +--:-:-:-:1 SHL tidLsbs, tidLsbs, 2; + +--:-:-:-:1 XMAD loadRow, bid, 48, tidLsbs; +--:-:-:-:1 XMAD wRow, tidMsbs, ldw, loadRow; + +--:-:-:-:1 XMAD storeWeights, tidMsbs, 48, tidLsbs; +--:-:-:-:1 SHL storeWeights, storeWeights, 2; + +//loadWeights = ((((warpTid * 8) + warpIndex) * 6) + (P1 ? 
3 : 0)) << 2 +--:-:-:-:1 XMAD loadWeights, warpTid, 8, warpIndex; +--:-:-:-:1 XMAD loadWeights, loadWeights, 6, RZ; +--:-:-:-:1 @P1 IADD loadWeights, loadWeights, 3; +--:-:-:-:1 SHL loadWeights, loadWeights, 2; + +//wAddr = &w[wRow] +--:-:-:-:1 LEA wAddr0.CC, wRow, param_w[0], 2; +--:-:-:-:1 LEA.HI.X wAddr1, wRow, param_w[1], RZ, 2; + +//ldw = ldw << 8 (byte stride for advancing wAddr by 64 rows of 4-byte weights) +--:-:-:-:1 SHL ldw, ldw, 8; + +//Compute row loading predicates +--:-:-:-:1 ISETP.LT.AND P1, PT, tidMsbs, rowSize, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, loadRow, rowSize, P1; +--:-:-:-:1 IADD rowSize, rowSize, -16; +--:-:-:-:1 ISETP.LT.AND P4, PT, loadRow, rowSize, P1; +--:-:-:-:1 IADD rowSize, rowSize, -16; +--:-:-:-:1 ISETP.LT.AND P5, PT, loadRow, rowSize, P1; + + +--:-:-:Y:c NOP; + +//Load weights to registers + + my $out; + my $regId = 0; + my $rowsize = 1152; + + for (my $col=0; $col < $rowsize; $col += 64) + { + $out .= "--:-:-:-:1 IADD tidMsbs, tidMsbs, 64;\n"; + + #Use vector loads from weight matrix + $regId = $col / 16; + $out .= sprintf "--:-:1:-:1 \@P3 LDG.E.128 weight%03d, [wAddr];\n", $regId; + $out .= sprintf "--:-:1:-:1 \@!P3 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + $regId = $col / 16 + 72; + $out .= sprintf "--:-:2:-:1 \@P4 LDG.E.128 weight%03d, [wAddr + 4x<16>];\n", $regId; + $out .= sprintf "--:-:2:-:1 \@!P4 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + $regId = $col / 16 + 144; + $out .= sprintf "--:-:3:-:1 \@P5 LDG.E.128 weight%03d, [wAddr + 4x<32>];\n", $regId; + $out .= sprintf "--:-:3:-:1 \@!P5 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + + $out .= "--:-:-:-:1 ISETP.LT.AND P3, PT, tidMsbs, param_rowSize, P3;\n"; + $out .= "--:-:-:-:1 ISETP.LT.AND P4, PT, tidMsbs, param_rowSize, P4;\n"; + $out .= "--:-:-:-:1 ISETP.LT.AND P5, PT, tidMsbs, param_rowSize, P5;\n"; + + #Store weights into shared memory + if ($col > 0) + { + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + } + + $regId = $col / 16; + $out .= sprintf "01:-:-:-:1 STS.U.128 [storeWeights], weight%03d;\n", $regId; + $regId = $col / 
16 + 72; + $out .= sprintf "02:-:-:-:1 STS.U.128 [storeWeights + 4x<16>], weight%03d;\n", $regId; + $regId = $col / 16 + 144; + $out .= sprintf "04:-:-:-:1 STS.U.128 [storeWeights + 4x<32>], weight%03d;\n", $regId; + + $out .= "--:-:-:-:6 IADD wAddr0.CC, wAddr0, ldw;\n"; + $out .= "--:-:-:-:1 IADD.X wAddr1, wAddr1, RZ;\n\n"; + + #Load each weight from shared mem + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + + foreach my $row (0 .. 2) + { + foreach my $shared_col (0 .. 3) + { + my $control; + + if (($col + 64) >= $rowsize && $row == 2 && $shared_col == 3) + { + $control = "--:1:6:-:2"; + } + else + { + $control = "--:-:-:-:1"; + } + + $regId = ($row * 72) + ($col / 16) + $shared_col; + my $shared_offset = $row + ($shared_col * 16 * 48); + $out .= sprintf "%s LDS.U weight%03d, [loadWeights + 4x<%d>];\n", $control, $regId, $shared_offset; + } + } + } + + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + + return $out; + + + +//Predicates for store code +--:-:-:-:1 ISETP.EQ.AND P2, PT, warpTid, 0, PT; +--:-:-:-:1 ISETP.EQ.AND P3, PT, warpTid, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P4, PT, warpTid, 2, PT; + +UNROLLING_LOOP: + +//Prime inner product loop by loading first rows of dnext +--:-:-:-:1 MOV loadIndex, tid; + +//storeDeltas = tid << 4 +--:-:-:-:1 SHL storeDeltas, tid, 4; +--:-:-:-:1 SHL loadDeltas, warpTid, 4; + +//dnextAddr = &d_next[loadIndex * ldd + timeStep] (16-byte elements) +--:-:-:-:1 XMAD dOffset, loadIndex, param_ldd, timeStep; +--:-:-:-:1 LEA dnextAddr0.CC, dOffset, param_dnext[0], 4; +01:-:-:-:2 LEA.HI.X dnextAddr1, dOffset, param_dnext[1], RZ, 4; + +//loadBuffer = *dnextAddr +--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT; +--:5:1:-:2 @P1 LDG.E.CI.128 loadBuffer, [dnextAddr]; +--:5:1:-:2 @!P1 LDS.U.CI.128 loadBuffer, [addr_zero]; + +//ldd = param_ldd << 12 +--:-:-:-:1 MOV ldd, param_ldd; +--:-:-:-:1 SHL ldd, ldd, 12; + + +//Initialize all accumulation registers to 0 + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 accum%02d, [addr_zero];\n", $_ * 4), 0..2; + + +//Update 
load index and load address +--:-:-:-:6 IADD loadIndex, loadIndex, 256; +--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT; +10:-:-:-:6 IADD dnextAddr0.CC, dnextAddr0, ldd; +--:-:-:-:6 IADD.X dnextAddr1, dnextAddr1, RZ; + +01:-:-:-:1 STS.U.128 [storeDeltas], loadBuffer; + +//Unrolled GEMM loop + + our @top; + + my $out = join '', @top; + + my $rowsize = 1152; + my $weight_index = 0; + + my $wait_flag = 2; + my $set_flag = 4; + my $read_buffer = 0; + my $write_buffer = 2; + + for (my $k=0; $k < $rowsize; $k+=256) + { + if ($k == 0) + { + $out .= "--:6:1:-:1 \@P1 LDG.E.CI.128 loadBuffer, [dnextAddr];\n"; + $out .= "--:-:1:-:1 \@!P1 LDS.U.128 loadBuffer, [addr_zero];\n\n"; + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + $out .= "--:-:2:-:1 LDS.U.128 delta0r, [loadDeltas];\n"; + $out .= "--:-:3:-:1 LDS.U.128 delta1r, [loadDeltas + 4x<4*16>];\n\n"; + } + $out .= "--:-:-:-:1 LOP.XOR storeDeltas, storeDeltas, 4096;\n"; + + foreach my $shared_row (0 .. 15) + { + if($weight_index < 72) + { + if ($shared_row < 14 && ($k + (16 * ($shared_row + 2))) < $rowsize) + { + my $read_bar = "-"; + if ($shared_row == 13 && ($k + 256) < $rowsize) + { + $read_bar = "5"; + } + $out .= sprintf "--:%s:%d:-:1 LDS.U.128 delta%dr, [loadDeltas + 4x<4*%d>];\n", $read_bar, $set_flag, $write_buffer, (16 * ($shared_row + 2)); + } + + if ($shared_row == 11 && ($k + 512) < $rowsize) + { + $out .= "--:-:-:-:1 IADD loadIndex, loadIndex, 256;\n"; + $out .= "20:-:-:-:1 IADD dnextAddr0.CC, dnextAddr0, ldd;\n"; + } + + if ($shared_row == 12 && ($k + 512) < $rowsize) + { + $out .= "--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;\n"; + $out .= "--:-:-:-:1 IADD.X dnextAddr1, dnextAddr1, RZ;\n"; + } + + if ($shared_row == 13) + { + $out .= "01:-:-:-:1 STS.U.128 [storeDeltas], loadBuffer;\n"; + + if(($k + 512) < $rowsize) + { + $out .= "--:6:1:-:1 \@P1 LDG.E.CI.128 loadBuffer, [dnextAddr];\n"; + $out .= "--:-:1:-:1 \@!P1 LDS.U.128 loadBuffer, [addr_zero];\n\n"; + } + else + { + $out .= 
"--:-:-:-:6 IADD dOffset, rowOffset, warpTid;\n"; + $out .= "--:-:-:-:6 XMAD dOffset, dOffset, param_ldd, timeStep;\n"; + $out .= "--:-:-:-:6 LEA dnextAddr0.CC, dOffset, param_d[0], 4;\n"; + $out .= "--:-:-:-:2 LEA.HI.X dnextAddr1, dOffset, param_d[1], RZ, 4;\n"; + $out .= "--:-:6:-:1 \@P0 LDG.E.CI.128 loadBuffer, [dnextAddr];\n\n"; + } + } + + if ($shared_row == 14 && ($k + 256) < $rowsize) + { + $out .= "10:-:-:-:1 LOP.XOR loadDeltas, loadDeltas, 4096;\n"; + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + $out .= sprintf "--:-:%d:-:1 LDS.U.128 delta%dr, [loadDeltas];\n", $set_flag, $write_buffer; + } + + if ($shared_row == 15 && ($k + 256) < $rowsize) + { + $out .= sprintf "--:-:%d:-:1 LDS.U.128 delta%dr, [loadDeltas + 4x<4*16>];\n\n", $set_flag, $write_buffer; + } + + foreach my $row (0 .. 2) + { + my $weight = ($row * 72) + $weight_index; + + foreach my $col (0 .. 3) + { + my $accum = ($row * 4) + $col; + my $wait = "--"; + my $stall = 1; + if ($accum == 0) + { + if ($weight_index == 0) + { + $wait = sprintf "%02x", (0x20 | (1 << ($wait_flag - 1))); + } + else + { + $wait = sprintf "%02x", (1 << ($wait_flag - 1)); + } + } + + if ($row == 2 && $col == 3) + { + if ($shared_row < 13 && ($k + (16 * ($shared_row + 3))) < $rowsize) + { + $stall = 0; + } + elsif ($shared_row == 14 && ($k + 256) < $rowsize) + { + $stall = 0; + } + } + + $out .= sprintf "%s:-:-:-:%d FFMA accum%02d, weight%03d, delta%dr%d, accum%02d;\n", $wait, $stall, $accum, $weight, $read_buffer, $col, $accum; + } + } + + $weight_index++; + } + + $wait_flag += 1; + $set_flag += 1; + $read_buffer += 1; + $write_buffer += 1; + if($wait_flag == 5) + { + $wait_flag = 2; + } + if($set_flag == 5) + { + $set_flag = 2; + } + if($read_buffer == 3) + { + $read_buffer = 0; + } + if($write_buffer == 3) + { + $write_buffer = 0; + } + } + } + + return $out; + + +//Load hidden states +--:-:-:-:6 IADD hOffset, rowOffset, warpTid; +--:-:-:-:6 XMAD hOffset, hOffset, param_ldh, timeStep; +--:-:-:-:6 LEA hAddr0.CC, hOffset, 
param_h[0], 4; +--:-:-:-:2 LEA.HI.X hAddr1, hOffset, param_h[1], RZ, 4; +--:-:5:-:1 @P0 LDG.E.CI.128 h, [hAddr]; + +//Reduction between threads +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 1, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 1, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 1, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 1, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 1, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 2, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 2, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 2, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 2, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 2, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 2, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 2, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 2, 
0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 4, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 4, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 4, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 4, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 4, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 4, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 4, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 4, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 8, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 8, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 8, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 8, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; 
+--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 8, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 8, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 8, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 8, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:5 MOV reluclip, param_reluclip; + +//Compute store pointer + +--:-:-:-:1 IADD hRow, rowOffset, warpTid; +--:-:-:-:1 XMAD storeIndex, hRow, param_ldd, timeStep; +--:-:-:-:1 LEA dAddr0.CC, storeIndex, param_d[0], 4; +--:-:-:-:1 LEA.HI.X dAddr1, storeIndex, param_d[1], RZ, 4; +--:-:-:-:1 LEA lockAddr0, timeStep, param_lockAddr[0], 2; +--:-:-:-:1 LEA.HI.X lockAddr1, timeStep, param_lockAddr[1], RZ, 2; + +//Conditional select for output +//TODO: make sure scheduler orders these such that first one waits on barrier +20:-:-:-:1 @P2 FADD output0, output0, accum00; +20:-:-:-:1 @P3 FADD output0, output0, accum04; +20:-:-:-:1 @P4 FADD output0, output0, accum08; + +20:-:-:-:1 @P2 FADD output1, output1, accum01; +20:-:-:-:1 @P3 FADD output1, output1, accum05; +20:-:-:-:1 @P4 FADD output1, output1, accum09; + +20:-:-:-:1 @P2 FADD output2, output2, accum02; +20:-:-:-:1 @P3 FADD output2, output2, accum06; +20:-:-:-:1 @P4 FADD output2, output2, accum10; + +20:-:-:-:1 @P2 FADD output3, output3, accum03; +20:-:-:-:1 @P3 FADD 
output3, output3, accum07; +20:-:-:-:3 @P4 FADD output3, output3, accum11; + + +//Save select predicates +//TODO: how many stall cycles needed here? +--:-:-:-:6 P2R predSave, PR, RZ, 0x1e; + +//Multiply by bprop for reclinclip activation function +//TODO: others + +10:-:-:-:1 FSETP.LT.AND P2, PT, RZ, h0, PT; +10:-:-:-:1 FSETP.LT.AND P3, PT, RZ, h1, PT; +10:-:-:-:1 FSETP.LT.AND P4, PT, RZ, h2, PT; +10:-:-:-:1 FSETP.LT.AND P5, PT, RZ, h3, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, h0, reluclip, P2; +--:-:-:-:1 FSETP.LT.AND P3, PT, h1, reluclip, P3; +--:-:-:-:1 FSETP.LT.AND P4, PT, h2, reluclip, P4; +--:-:-:-:1 FSETP.LT.AND P5, PT, h3, reluclip, P5; +--:-:-:-:1 @!P2 FMUL output0, output0, RZ; +--:-:-:-:1 @!P3 FMUL output1, output1, RZ; +--:-:-:-:1 @!P4 FMUL output2, output2, RZ; +--:-:-:-:1 @!P5 FMUL output3, output3, RZ; + +//Update timestep +--:-:-:-:1 ISETP.EQ.AND P1, PT, RZ, param_reverse, PT; +--:-:-:-:1 @P1 MOV setVal, 1; +--:-:-:-:1 @!P1 MOV setVal, -1; +--:-:-:-:1 @P1 MOV expectVal, param_seqLength; +--:-:-:-:1 @!P1 MOV expectVal, -1; +--:-:-:-:1 IADD timeStep, timeStep, setVal; + + +//Conditional store +--:-:-:-:5 @P0 STG.E.CI.128 [dAddr], output; + +//Compute predicate for time unrolling loop +--:-:-:Y:d ISETP.NE.AND P5, PT, timeStep, expectVal, PT; + +//P2 = (tid != 0) +//setVal = 1 +--:-:-:-:1 ISETP.NE.AND P2, PT, tid, RZ, PT; +--:-:-:-:1 MOV expectVal, param_numBlks; +--:-:-:Y:b MOV setVal, 1; + +//Barrier for all blocks +--:-:-:-:f MEMBAR.GL; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:2 SSY SSY_TARGET1; +--:-:-:-:d @P2 SYNC; + +--:-:-:Y:2 ATOM.E.ADD RZ, [lockAddr], setVal; +--:-:-:-:d SYNC; + +SSY_TARGET1: +--:-:-:-:1 SSY SSY_TARGET2; +--:-:-:-:d @P2 SYNC; + +SPINLOCK: +--:-:1:Y:2 LDG.E lockVal, [lockAddr]; +01:-:-:Y:d ISETP.NE.AND P2, PT, lockVal, expectVal, PT; +--:-:-:-:5 @P2 BRA.U SPINLOCK; +--:-:-:-:d SYNC; + +SSY_TARGET2: +--:-:-:-:5 BAR.SYNC 0; + +//Restore select predicates +--:-:-:-:1 R2P PR, predSave, 0x1e; + +//Conditional branch back to beginning of loop 
+--:-:-:Y:5 @P5 BRA.U UNROLLING_LOOP; + +--:-:-:-:5 EXIT; diff --git a/Kernel/Convolution/Pascal/persistent_rnn_fprop.sass b/Kernel/Convolution/Pascal/persistent_rnn_fprop.sass new file mode 100644 index 0000000..6a11539 --- /dev/null +++ b/Kernel/Convolution/Pascal/persistent_rnn_fprop.sass @@ -0,0 +1,653 @@ +# Kernel: presistent_birnn + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(64*48)> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_h[0] : c[0x0][0x140] + param_h[1] : c[0x0][0x144] + param_hprev[0] : c[0x0][0x148] + param_hprev[1] : c[0x0][0x14c] + param_bias[0] : c[0x0][0x150] + param_bias[1] : c[0x0][0x154] + param_w[0] : c[0x0][0x158] + param_w[1] : c[0x0][0x15c] + param_lockAddr[0] : c[0x0][0x160] + param_lockAddr[1] : c[0x0][0x164] + param_ldh : c[0x0][0x168] + param_ldw : c[0x0][0x16c] + param_bsz : c[0x0][0x170] + param_seqLength : c[0x0][0x174] + param_numBlks : c[0x0][0x178] + param_rowSize : c[0x0][0x17c] + param_reverse : c[0x0][0x180] + param_reluclip : c[0x0][0x184] + + + + + 0-215 : weight<000-215> + 216-227 : accum<00-11> + 228-229 : timeStep, biasValue + 230-232 : warpTid, rowOffset, tid + + 233 : bid + + 236-243 : wAddr0r<0-1>, wAddr1r<0-1>, wAddr2r<0-1>, biasAddr<0-1> + 244-254 ~ ldw, wRow, warpTid4, loadRow, warpIndex, storeWeights, loadWeights, rowSize + + 233 : hOffset + 233 : ldh + 234-239 : hprevAddr<0-1>, loadBuffer<0-3> + 240-251 : 
hidden0r<0-3>, hidden1r<0-3>, hidden2r<0-3> + 252-254 ~ loadHiddens, storeHiddens, loadIndex + + 240-251 : peerR0V<0-3>, peerR1V<0-3>, peerR2V<0-3> + + 240-249 : output<0-3>, hAddr<0-1>, lockAddr<0-1>, expectVal, setVal + 250-254 ~ storeIndex, hRow, predSave, lockVal, reluclip + + + +//Get tid/block id +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R bid, SR_CTAID.X; + +//Store zeros at addr_zero +--:-:-:-:1 STS.128 [addr_zero], RZ; + + +--:-:-:-:1 MOV ldw, param_ldw; +--:-:-:-:1 MOV rowSize, param_rowSize; + +//timeStep = (param_reverse == 0) ? 0 : param_seqLength +--:-:-:-:1 ISETP.EQ.AND P2, PT, RZ, param_reverse, PT; +--:-:-:-:1 SEL timeStep, RZ, param_seqLength, P2; +--:-:-:-:1 @!P2 IADD timeStep, timeStep, -1; + +//warpIndex = threadIdx.x >> 5 +01:-:-:-:1 SHR.U32 warpIndex, tid, 5; + +//warpTid = threadIdx.x & 0x1f +01:-:-:-:1 LOP.AND warpTid, tid, 0x1f; + +//rowOffset = ((blockIdx.x << 3) + warp_index) * 6 +02:-:-:-:1 SHL rowOffset, bid, 3; +--:-:-:-:1 IADD rowOffset, rowOffset, warpIndex; +--:-:-:-:1 XMAD rowOffset, rowOffset, 6, RZ; + +//if(warp_tid > 15) rowOffset += 3 +--:-:-:-:1 ISETP.GT.AND P1, PT, warpTid, 15, PT; +--:-:-:-:1 @P1 IADD rowOffset, rowOffset, 3; + +//warpTid = warpTid & 0x0f +--:-:-:-:1 LOP.AND warpTid, warpTid, 0x0f; +--:-:-:-:1 ISETP.LT.AND P0, PT, warpTid, 3, PT; + +//warpTid4 = warpTid << 2 +--:-:-:-:1 SHL warpTid4, warpTid, 2; + +//storeWeights = ((P1) ? (warpTid4 + 3*64) : warpTid4) << 2 +//loadWeights = ((P1) ? 
(warpTid + 3*64) : warpTid) << 2 +--:-:-:-:1 @P1 MOV loadWeights, 3; +--:-:-:-:1 @!P1 MOV loadWeights, RZ; + +--:-:-:-:1 XMAD loadWeights, warpIndex, 6, loadWeights; +--:-:-:-:1 SHL loadWeights, loadWeights, 6; + +--:-:-:-:1 IADD storeWeights, loadWeights, warpTid4; +--:-:-:-:1 IADD loadWeights, loadWeights, warpTid; +--:-:-:-:1 SHL storeWeights, storeWeights, 2; +--:-:-:-:1 SHL loadWeights, loadWeights, 2; + +//wRow = rowOffset * ldw + warpTid4 +--:-:-:-:1 XMAD wRow, rowOffset, ldw, warpTid4; + +//wAddr0r = &w[wRow] +--:-:-:-:1 LEA wAddr0r0.CC, wRow, param_w[0], 2; +--:-:-:-:1 LEA.HI.X wAddr0r1, wRow, param_w[1], RZ, 2; + +//ldw = ldw << 2 +--:-:-:-:1 SHL ldw, ldw, 2; + +//wAddr1r = wAddr0r + ldw +--:-:-:-:1 IADD wAddr1r0.CC, wAddr0r0, ldw; +--:-:-:-:1 IADD.X wAddr1r1, wAddr0r1, RZ; + +//wAddr2r = wAddr1r + ldw +--:-:-:-:1 IADD wAddr2r0.CC, wAddr1r0, ldw; +--:-:-:-:1 IADD.X wAddr2r1, wAddr1r1, RZ; + +//Compute row loading predicates +--:-:-:-:1 ISETP.LT.AND P1, PT, warpTid4, rowSize, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, rowOffset, rowSize, P1; +--:-:-:-:1 IADD rowSize, rowSize, -1; +--:-:-:-:1 ISETP.LT.AND P4, PT, rowOffset, rowSize, P1; +--:-:-:-:1 IADD rowSize, rowSize, -1; +--:-:-:-:1 ISETP.LT.AND P5, PT, rowOffset, rowSize, P1; + + +--:-:-:Y:c NOP; + +//Load weights to registers + + my $out; + my $regId = 0; + my $rowsize = 1152; + + for (my $col=0; $col < $rowsize; $col += 64) + { + $out .= "--:-:-:-:1 IADD warpTid4, warpTid4, 64;\n"; + + #Use vector loads from weight matrix + $regId = $col / 16; + $out .= sprintf "--:-:1:-:1 \@P3 LDG.E.128 weight%03d, [wAddr0r + 4x<%d>];\n", $regId, $col; + $out .= sprintf "--:-:1:-:1 \@!P3 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + $regId = $col / 16 + 72; + $out .= sprintf "--:-:2:-:1 \@P4 LDG.E.128 weight%03d, [wAddr1r + 4x<%d>];\n", $regId, $col; + $out .= sprintf "--:-:2:-:1 \@!P4 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + $regId = $col / 16 + 144; + $out .= sprintf "--:-:3:-:1 \@P5 LDG.E.128 weight%03d, 
[wAddr2r + 4x<%d>];\n", $regId, $col; + $out .= sprintf "--:-:3:-:1 \@!P5 LDS.U.128 weight%03d, [addr_zero];\n", $regId; + + $out .= "--:-:-:-:1 ISETP.LT.AND P3, PT, warpTid4, rowSize, P3;\n"; + $out .= "--:-:-:-:1 ISETP.LT.AND P4, PT, warpTid4, rowSize, P4;\n"; + $out .= "--:-:-:-:1 ISETP.LT.AND P5, PT, warpTid4, rowSize, P5;\n"; + + #Store weights into shared memory + if ($col > 0) + { + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + } + + $regId = $col / 16; + $out .= sprintf "01:-:-:-:1 STS.U.128 [storeWeights], weight%03d;\n", $regId; + $regId = $col / 16 + 72; + $out .= sprintf "02:-:-:-:1 STS.U.128 [storeWeights + 4x<64>], weight%03d;\n", $regId; + $regId = $col / 16 + 144; + $out .= sprintf "04:-:-:-:1 STS.U.128 [storeWeights + 4x<128>], weight%03d;\n", $regId; + + #Load each weight from shared mem + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + + foreach my $shared_col (0 .. 3) + { + foreach my $row (0 .. 2) + { + my $control; + + if (($col + 64) >= $rowsize && $row == 2 && $shared_col == 3) + { + $control = "--:1:6:-:2"; + } + else + { + $control = "--:-:-:-:1"; + } + + $regId = ($row * 72) + ($col / 16) + $shared_col; + my $shared_offset = ($row * 64) + ($shared_col * 16); + $out .= sprintf "%s LDS.U weight%03d, [loadWeights + 4x<%d>];\n", $control, $regId, $shared_offset; + } + } + } + + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + + return $out; + + + +//Conditional load of bias + +01:-:-:-:1 IADD loadRow, rowOffset, warpTid; +--:-:-:-:1 ISETP.LT.AND P0, PT, loadRow, param_rowSize, P0; +--:-:-:-:1 LEA biasAddr0.CC, loadRow, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X biasAddr1, loadRow, param_bias[1], RZ, 2; +--:-:-:-:1 @P0 LDG.E biasValue, [biasAddr]; +--:-:-:-:1 @!P0 MOV biasValue, RZ; + + +//Predicates for store code +--:-:-:-:1 ISETP.EQ.AND P2, PT, warpTid, 0, PT; +--:-:-:-:1 ISETP.EQ.AND P3, PT, warpTid, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P4, PT, warpTid, 2, PT; + +UNROLLING_LOOP: + +//Prime inner product loop by loading first rows of hprev +--:-:-:-:1 MOV loadIndex, tid; 
+ +//storeHiddens = tid << 4 +--:-:-:-:1 SHL storeHiddens, tid, 4; +--:-:-:-:1 SHL loadHiddens, warpTid, 4; + +//hprevAddr = &h_prev[loadIndex * ldh + timeStep] +--:-:-:-:1 XMAD hOffset, loadIndex, param_ldh, timeStep; +--:-:-:-:1 LEA hprevAddr0.CC, hOffset, param_hprev[0], 4; +--:-:-:-:2 LEA.HI.X hprevAddr1, hOffset, param_hprev[1], RZ, 4; + +//loadBuffer = *hprevAddr +--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT; +--:5:1:-:2 @P1 LDG.E.CI.128 loadBuffer, [hprevAddr]; +--:5:1:-:2 @!P1 LDS.U.128 loadBuffer, [addr_zero]; + +//ldh = param_ldh << 12 +--:-:-:-:1 MOV ldh, param_ldh; +--:-:-:-:1 SHL ldh, ldh, 12; + + +//Initialize all accumulation registers to 0 + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 accum%02d, [addr_zero];\n", $_ * 4), 0..2; + + +//Update load index and load address +--:-:-:-:6 IADD loadIndex, loadIndex, 256; +--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT; +10:-:-:-:6 IADD hprevAddr0.CC, hprevAddr0, ldh; +--:-:-:-:6 IADD.X hprevAddr1, hprevAddr1, RZ; + +01:-:-:-:1 STS.U.128 [storeHiddens], loadBuffer; + +//Unrolled GEMM loop + + our @top; + + my $out = join '', @top; + + my $rowsize = 1152; + my $weight_index = 0; + + my $wait_flag = 2; + my $set_flag = 4; + my $read_buffer = 0; + my $write_buffer = 2; + + for (my $k=0; $k < $rowsize; $k+=256) + { + if ($k == 0) + { + $out .= "--:6:1:-:1 \@P1 LDG.E.CI.128 loadBuffer, [hprevAddr];\n"; + $out .= "--:-:1:-:1 \@!P1 LDS.U.128 loadBuffer, [addr_zero];\n\n"; + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + $out .= "--:-:2:-:1 LDS.U.128 hidden0r, [loadHiddens];\n"; + $out .= "--:-:3:-:1 LDS.U.128 hidden1r, [loadHiddens + 4x<4*16>];\n\n"; + } + $out .= "--:-:-:-:1 LOP.XOR storeHiddens, storeHiddens, 4096;\n"; + + foreach my $shared_row (0 .. 
15) + { + if($weight_index < 72) + { + if ($shared_row < 14 && ($k + (16 * ($shared_row + 2))) < $rowsize) + { + my $read_bar = "-"; + if ($shared_row == 13 && ($k + 256) < $rowsize) + { + $read_bar = "5"; + } + $out .= sprintf "--:%s:%d:-:1 LDS.U.128 hidden%dr, [loadHiddens + 4x<4*%d>];\n", $read_bar, $set_flag, $write_buffer, (16 * ($shared_row + 2)); + } + + if ($shared_row == 11) + { + $out .= "--:-:-:-:1 IADD loadIndex, loadIndex, 256;\n"; + $out .= "20:-:-:-:1 IADD hprevAddr0.CC, hprevAddr0, ldh;\n"; + } + + if ($shared_row == 12) + { + $out .= "--:-:-:-:1 ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;\n"; + $out .= "--:-:-:-:1 IADD.X hprevAddr1, hprevAddr1, RZ;\n"; + } + + if ($shared_row == 13) + { + $out .= "01:-:-:-:1 STS.U.128 [storeHiddens], loadBuffer;\n"; + + if (($k + 512) < $rowsize) + { + $out .= "--:6:1:-:1 \@P1 LDG.E.CI.128 loadBuffer, [hprevAddr];\n"; + $out .= "--:-:1:-:1 \@!P1 LDS.U.128 loadBuffer, [addr_zero];\n\n"; + } + else + { + $out .= "--:-:-:-:6 IADD hOffset, rowOffset, warpTid;\n"; + $out .= "--:-:-:-:6 XMAD hOffset, hOffset, param_ldh, timeStep;\n"; + $out .= "--:-:-:-:6 LEA hprevAddr0.CC, hOffset, param_h[0], 4;\n"; + $out .= "--:-:-:-:2 LEA.HI.X hprevAddr1, hOffset, param_h[1], RZ, 4;\n"; + $out .= "--:-:6:-:1 \@P0 LDG.E.CI.128 loadBuffer, [hprevAddr];\n\n"; + } + } + + if ($shared_row == 14) + { + $out .= "10:-:-:-:1 LOP.XOR loadHiddens, loadHiddens, 4096;\n"; + $out .= "--:-:-:-:5 BAR.SYNC 0;\n\n"; + $out .= sprintf "--:-:%d:-:1 LDS.U.128 hidden%dr, [loadHiddens];\n", $set_flag, $write_buffer; + } + + if ($shared_row == 15) + { + $out .= sprintf "--:-:%d:-:1 LDS.U.128 hidden%dr, [loadHiddens + 4x<4*16>];\n\n", $set_flag, $write_buffer; + } + + foreach my $row (0 .. 2) + { + my $weight = ($row * 72) + $weight_index; + + foreach my $col (0 .. 
3) + { + my $accum = ($row * 4) + $col; + my $wait = "--"; + my $stall = 1; + if ($accum == 0) + { + if ($weight_index == 0) + { + $wait = sprintf "%02x", (0x20 | (1 << ($wait_flag - 1))); + } + else + { + $wait = sprintf "%02x", (1 << ($wait_flag - 1)); + } + } + + if ($row == 2 && $col == 3) + { + if ($shared_row < 13 && ($k + (16 * ($shared_row + 3))) < $rowsize) + { + $stall = 0; + } + elsif ($shared_row == 14 && ($k + 256) < $rowsize) + { + $stall = 0; + } + } + + $out .= sprintf "%s:-:-:-:%d FFMA accum%02d, weight%03d, hidden%dr%d, accum%02d;\n", $wait, $stall, $accum, $weight, $read_buffer, $col, $accum; + } + } + + $weight_index++; + } + + $wait_flag += 1; + $set_flag += 1; + $read_buffer += 1; + $write_buffer += 1; + if($wait_flag == 5) + { + $wait_flag = 2; + } + if($set_flag == 5) + { + $set_flag = 2; + } + if($read_buffer == 3) + { + $read_buffer = 0; + } + if($write_buffer == 3) + { + $write_buffer = 0; + } + } + } + + return $out; + + +//Reduction between threads +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 1, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 1, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 1, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 1, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 1, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 1, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY 
PT, peerR1V0, accum04, 2, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 2, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 2, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 2, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 2, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 2, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 2, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 2, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 4, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 4, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 4, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 4, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 4, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 4, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 4, 
0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 4, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V0, accum00, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V0, accum04, 8, 0x1f; +--:-:1:-:1 SHFL.BFLY PT, peerR2V0, accum08, 8, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V1, accum01, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V1, accum05, 8, 0x1f; +--:-:2:-:1 SHFL.BFLY PT, peerR2V1, accum09, 8, 0x1f; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V2, accum02, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V2, accum06, 8, 0x1f; +--:-:3:-:1 SHFL.BFLY PT, peerR2V2, accum10, 8, 0x1f; + +--:-:-:-:1 SHFL.BFLY PT, peerR0V3, accum03, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, peerR1V3, accum07, 8, 0x1f; +--:-:4:-:1 SHFL.BFLY PT, peerR2V3, accum11, 8, 0x1f; + +01:-:-:-:1 FADD accum00, accum00, peerR0V0; +--:-:-:-:1 FADD accum04, accum04, peerR1V0; +--:-:-:-:1 FADD accum08, accum08, peerR2V0; + +02:-:-:-:1 FADD accum01, accum01, peerR0V1; +--:-:-:-:1 FADD accum05, accum05, peerR1V1; +--:-:-:-:1 FADD accum09, accum09, peerR2V1; + +04:-:-:-:1 FADD accum02, accum02, peerR0V2; +--:-:-:-:1 FADD accum06, accum06, peerR1V2; +--:-:-:-:1 FADD accum10, accum10, peerR2V2; + +08:-:-:-:1 FADD accum03, accum03, peerR0V3; +--:-:-:-:1 FADD accum07, accum07, peerR1V3; +--:-:-:-:1 FADD accum11, accum11, peerR2V3; + +//Compute store pointer + +--:-:-:-:1 IADD hRow, rowOffset, warpTid; +--:-:-:-:1 XMAD storeIndex, hRow, param_ldh, timeStep; +--:-:-:-:1 LEA hAddr0.CC, storeIndex, param_h[0], 4; 
+--:-:-:-:1 LEA.HI.X hAddr1, storeIndex, param_h[1], RZ, 4; +--:-:-:-:1 LEA lockAddr0, timeStep, param_lockAddr[0], 2; +--:-:-:-:1 LEA.HI.X lockAddr1, timeStep, param_lockAddr[1], RZ, 2; + +//Conditional select for output +--:-:-:-:1 @P2 MOV output0, accum00; +--:-:-:-:1 @P3 MOV output0, accum04; +--:-:-:-:1 @P4 MOV output0, accum08; + +--:-:-:-:1 @P2 MOV output1, accum01; +--:-:-:-:1 @P3 MOV output1, accum05; +--:-:-:-:1 @P4 MOV output1, accum09; + +--:-:-:-:1 @P2 MOV output2, accum02; +--:-:-:-:1 @P3 MOV output2, accum06; +--:-:-:-:1 @P4 MOV output2, accum10; + +--:-:-:-:1 @P2 MOV output3, accum03; +--:-:-:-:1 @P3 MOV output3, accum07; +--:-:-:-:3 @P4 MOV output3, accum11; + +//Update timestep +--:-:-:-:1 ISETP.EQ.AND P5, PT, RZ, param_reverse, PT; +--:-:-:-:1 @P5 MOV setVal, 1; +--:-:-:-:1 @!P5 MOV setVal, -1; +--:-:-:-:1 @P5 MOV expectVal, param_seqLength; +--:-:-:-:1 @!P5 MOV expectVal, -1; +--:-:-:-:1 IADD timeStep, timeStep, setVal; + + +//Save select predicates +--:-:-:-:1 P2R predSave, PR, RZ, 0x0c; + +--:-:-:-:1 MOV reluclip, param_reluclip; + +//Add bias for output +--:-:-:-:1 FADD output0, output0, biasValue; +--:-:-:-:1 FADD output1, output1, biasValue; +--:-:-:-:1 FADD output2, output2, biasValue; +--:-:-:-:3 FADD output3, output3, biasValue; + +//Accumulate on top of current data +20:-:-:-:1 FADD output0, output0, loadBuffer0; +--:-:-:-:1 FADD output1, output1, loadBuffer1; +--:-:-:-:1 FADD output2, output2, loadBuffer2; +--:-:-:-:3 FADD output3, output3, loadBuffer3; + +//Activation function +//TODO: add others +--:-:-:-:2 FMNMX output0, output0, RZ, !PT; +--:-:-:-:2 FMNMX output1, output1, RZ, !PT; +--:-:-:-:2 FMNMX output2, output2, RZ, !PT; +--:-:-:-:2 FMNMX output3, output3, RZ, !PT; + +--:-:-:-:2 FMNMX output0, output0, reluclip, PT; +--:-:-:-:2 FMNMX output1, output1, reluclip, PT; +--:-:-:-:2 FMNMX output2, output2, reluclip, PT; +--:-:-:-:2 FMNMX output3, output3, reluclip, PT; + +//Conditional store +--:-:-:-:1 @P0 STG.E.CI.128 [hAddr], 
output; + +//Compute predicate for time unrolling loop +--:-:-:Y:d ISETP.NE.AND P5, PT, timeStep, expectVal, PT; + +//P2 = (tid != 0) +//setVal = 1 +--:-:-:-:1 ISETP.NE.AND P2, PT, tid, RZ, PT; +--:-:-:-:1 MOV expectVal, param_numBlks; +--:-:-:Y:b MOV setVal, 1; + +//Barrier for all blocks +--:-:-:-:f MEMBAR.GL; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:2 SSY SSY_TARGET1; +--:-:-:-:d @P2 SYNC; + +--:-:-:Y:2 ATOM.E.ADD RZ, [lockAddr], setVal; +--:-:-:-:d SYNC; + +SSY_TARGET1: +--:-:-:-:1 SSY SSY_TARGET2; +--:-:-:-:d @P2 SYNC; + +SPINLOCK: +--:-:1:Y:2 LDG.E lockVal, [lockAddr]; +01:-:-:Y:d ISETP.NE.AND P2, PT, lockVal, expectVal, PT; +--:-:-:-:5 @P2 BRA.U SPINLOCK; +--:-:-:-:d SYNC; + +SSY_TARGET2: +--:-:-:-:5 BAR.SYNC 0; + +//Restore select predicates +--:-:-:-:1 R2P PR, predSave, 0x0c; + +//Conditional branch back to beginning of loop +--:-:-:Y:5 @P5 BRA.U UNROLLING_LOOP; + +--:-:-:-:5 EXIT; diff --git a/Kernel/Convolution/Pascal/sconv_bprop_C1_N64.sass b/Kernel/Convolution/Pascal/sconv_bprop_C1_N64.sass new file mode 100644 index 0000000..070db8c --- /dev/null +++ b/Kernel/Convolution/Pascal/sconv_bprop_C1_N64.sass @@ -0,0 +1,600 @@ +# Kernel: sconv_bprop_C32_N64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_lut : 4x<64*4> + + param_I[0] : c[0x0][0x140] + param_I[1] : c[0x0][0x144] + param_E[0] : c[0x0][0x148] + param_E[1] : c[0x0][0x14c] + param_F[0] : c[0x0][0x150] + param_F[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_N : c[0x0][0x15c] + param_K : c[0x0][0x160] + param_D : c[0x0][0x164] + param_H : c[0x0][0x168] + param_W : c[0x0][0x16c] + param_WN : c[0x0][0x170] + param_HWN : c[0x0][0x174] + param_DHWN : c[0x0][0x178] + param_C : c[0x0][0x17c] + param_CRST : c[0x0][0x180] + param_RST : c[0x0][0x184] + param_magic_RST : c[0x0][0x188] + param_shift_RST : c[0x0][0x18c] + param_RS : c[0x0][0x190] + param_magic_RS : c[0x0][0x194] + param_shift_RS : c[0x0][0x198] + param_S : c[0x0][0x19c] + param_magic_S : c[0x0][0x1a0] + param_shift_S : c[0x0][0x1a4] + param_pad_d : c[0x0][0x1a8] + param_pad_h : c[0x0][0x1ac] + param_pad_w : c[0x0][0x1b0] + param_str_d : c[0x0][0x1b4] + param_str_h : c[0x0][0x1b8] + param_str_w : c[0x0][0x1bc] + param_Q : c[0x0][0x1c0] + param_PQ : c[0x0][0x1c4] + param_QN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_MPQN : c[0x0][0x1d0] + param_magic_Q : c[0x0][0x1d4] + param_shift_Q : c[0x0][0x1d8] + param_magic_PQ : c[0x0][0x1dc] + param_shift_PQ : c[0x0][0x1e0] + param_CRST8 : c[0x0][0x1e4] + param_MPQN8 : c[0x0][0x1e8] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkE, blkF, blkMPQ + + 68-119 ~ k<0|4>, tidX, tid1, m, p, q, crst, n, n32, tf<0|4>, te, te<0|4>, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-79 : j0Ex<0-7>, j0Fy<0-7> + 80-95 : j1Ex<0-7>, j1Fy<0-7> + + 96-103 : load0F<0-3>, load4F<0-3> + 104-119 : load0E<0-7>, load4E<0-7> + + 120-123 : track0F<0-1>, track4F<0-1> + 124-127 : track0E<0-1>, 
track4E<0-1> + + 128-131 ~ writeEs, writeFs, swapBuf, K + 132-136 ~ readEs, readFs, mt, pr, qs + + 68-71 ~ lutStore, sliceI + 72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD + + 72-89 : c<0-7>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1> + 90-132 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkF, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; + + +// tidX = (tid & 7) << 2 +// k = tid >> 3 +01:-:-:-:1 LOP.AND tidX, tid, 7; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 k0, tid, 3; +--:-:-:-:1 IADD k4, k0, 4; + +--:-:-:-:1 MOV K, param_K; + +--:-:-:-:1 STS.128 [RZ], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15; + + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +08:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 
q, negQ, p, pq; + +// mt = m * w - pad_d +// pr = p * u - pad_h +// qs = q * v - pad_w +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, q, param_str_w, RZ; +--:-:-:-:1 IADD mt, mt, -param_pad_d; +--:-:-:-:1 IADD pr, pr, -param_pad_h; +--:-:-:-:1 IADD qs, qs, -param_pad_w; + +// crst = blkF*32 + tidX +// n = blkE*64 + tidX +04:-:-:-:1 ISCADD crst, blkF, tidX, 5; +08:-:-:-:1 ISCADD n, blkE, tidX, 6; +--:-:-:-:1 IADD n32, n, 32; + +// trackF = k*CRST + crst +--:-:-:-:1 XMAD tf0, k0, param_CRST, crst; +--:-:-:-:1 XMAD tf4, k4, param_CRST, crst; +--:-:-:-:1 LEA track0F0.CC, tf0, param_F[0], 2; +--:-:-:-:1 LEA.HI.X track0F1, tf0, param_F[1], RZ, 2; +--:-:-:-:1 LEA track4F0.CC, tf4, param_F[0], 2; +--:-:-:-:1 LEA.HI.X track4F1, tf4, param_F[1], RZ, 2; + +// trackE = k*MPQN + m*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD te, q, param_N, n; +--:-:-:-:1 XMAD.LO2C te, p, param_QN, te; +--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te; +--:-:-:-:1 XMAD.LO2C te0, k0, param_MPQN, te; +--:-:-:-:1 XMAD.LO2C te4, k4, param_MPQN, te; +--:-:-:-:1 LEA track0E0.CC, te0, param_E[0], 2; +--:-:-:-:1 LEA.HI.X track0E1, te0, param_E[1], RZ, 2; +--:-:-:-:1 LEA track4E0.CC, te4, param_E[0], 2; +--:-:-:-:1 LEA.HI.X track4E1, te4, param_E[1], RZ, 2; + +// P1 = crst < CRST +// P2 = n < N +// P3 = n+32 < N +--:-:-:-:1 ISETP.LT.AND P1, PT, crst, param_CRST, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, n, param_N, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, n32, param_N, PT; + +// writeFs = (32*k + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, k0, tidX, 5; +--:-:-:-:1 SHL writeFs, writeFs, 2; +// writeEs = (64*k + tidX) * 4 +--:-:-:-:1 ISCADD writeEs, k0, tidX, 6; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x<32*8>, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readEs = ((tid >> 1) & 7) 
<< 4 +--:-:-:-:1 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readEs, readEs, 4x<32*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, 4x<32*8 + 64*8>; + + +--:-:-:-:0 IADD K, K, -8; + +--:-:-:-:1 @P1 LDG.E.CI load0F0, [track0F + 4x<0>]; +--:-:-:-:1 @P1 LDG.E.CI load0F1, [track0F + 4x<1>]; +--:-:-:-:1 @P1 LDG.E.CI load0F2, [track0F + 4x<2>]; +--:-:1:-:1 @P1 LDG.E.CI load0F3, [track0F + 4x<3>]; + +--:-:-:-:1 @P1 LDG.E.CI load4F0, [track4F + 4x<0>]; +--:-:-:-:1 @P1 LDG.E.CI load4F1, [track4F + 4x<1>]; +--:-:-:-:1 @P1 LDG.E.CI load4F2, [track4F + 4x<2>]; +--:-:2:-:1 @P1 LDG.E.CI load4F3, [track4F + 4x<3>]; + +--:-:-:-:0 ISETP.GT.AND P1, PT, K, RZ, P1; + +--:-:3:-:1 @P2 LDG.E.128 load0E0, [track0E + 4x< 0>]; +--:-:4:-:1 @P3 LDG.E.128 load0E4, [track0E + 4x<32>]; +--:-:5:-:1 @P2 LDG.E.128 load4E0, [track4E + 4x< 0>]; +--:-:6:-:1 @P3 LDG.E.128 load4E4, [track4E + 4x<32>]; + +--:-:-:-:2 ISETP.GT.AND P2, PT, K, RZ, P2; +--:-:-:-:0 ISETP.GT.AND P3, PT, K, RZ, P3; + +01:-:-:-:1 STS.128 [writeFs + 4x<0*32>], load0F; +--:-:-:-:6 IADD track0F0.CC, track0F0, param_CRST8; +--:-:-:-:0 IADD.X track0F1, track0F1, RZ; + +02:-:-:-:1 STS.128 [writeFs + 4x<4*32>], load4F; +--:-:-:-:6 IADD track4F0.CC, track4F0, param_CRST8; +--:-:-:-:0 IADD.X track4F1, track4F1, RZ; + +04:-:-:-:1 STS.128 [writeEs + 4x<0*64 + 0>], load0E0; +08:-:-:-:1 STS.128 [writeEs + 4x<0*64 + 32>], load0E4; +--:-:-:-:6 IADD track0E0.CC, track0E0, param_MPQN8; +--:-:-:-:0 IADD.X track0E1, track0E1, RZ; + +10:-:-:-:1 STS.128 [writeEs + 4x<4*64 + 0>], load4E0; +20:1:-:-:1 STS.128 [writeEs + 4x<4*64 + 32>], load4E4; +--:-:-:-:6 IADD track4E0.CC, track4E0, param_MPQN8; +--:-:-:-:1 IADD.X track4E1, track4E1, RZ; + +01:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD writeFs, writeFs, swapBuf; +--:-:-:-:2 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD K, K, -8; + +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex4, 
[readEs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>]; + +--:-:-:-:1 @P1 LDG.E.CI load0F0, [track0F + 4x<0>]; +--:-:-:-:1 @P1 LDG.E.CI load0F1, [track0F + 4x<1>]; +--:-:-:-:1 @P1 LDG.E.CI load0F2, [track0F + 4x<2>]; +--:-:2:-:1 @P1 LDG.E.CI load0F3, [track0F + 4x<3>]; + +--:-:-:-:1 @P1 LDG.E.CI load4F0, [track4F + 4x<0>]; +--:-:-:-:1 @P1 LDG.E.CI load4F1, [track4F + 4x<1>]; +--:-:-:-:1 @P1 LDG.E.CI load4F2, [track4F + 4x<2>]; +--:-:3:-:1 @P1 LDG.E.CI load4F3, [track4F + 4x<3>]; + +--:-:-:-:0 ISETP.GT.AND P1, PT, K, RZ, P1; + +--:-:4:-:1 @P2 LDG.E.128 load0E0, [track0E + 4x< 0>]; +--:-:4:-:1 @P3 LDG.E.128 load0E4, [track0E + 4x<32>]; +--:-:5:-:1 @P2 LDG.E.128 load4E0, [track4E + 4x< 0>]; +--:-:5:-:1 @P3 LDG.E.128 load4E4, [track4E + 4x<32>]; + +--:-:-:-:2 ISETP.GT.AND P2, PT, K, RZ, P2; +--:-:-:-:1 ISETP.GT.AND P3, PT, K, RZ, P3; + +NEXT_8K: +--:-:-:-:1 ISETP.GT.AND P0, PT, K, -8, PT; + + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD K, K, -8;\n", + + j0c12 => "02:2:-:-:1 \@P0 STS.128 [writeFs + 4x<0*32>], load0F;\n", + j0c14 => "--:-:-:-:1 \@P0 IADD track0F0.CC, track0F0, param_CRST8;\n", + j0c19 => "--:-:-:-:1 \@P0 IADD.X track0F1, track0F1, RZ;\n", + + j0c56 => "02:-:-:-:1 \@P1 LDG.E.CI load0F0, [track0F + 4x<0>];\n", + j0c58 => "--:-:-:-:1 \@P1 LDG.E.CI load0F1, [track0F + 4x<1>];\n", + j0c60 => "--:-:-:-:1 \@P1 LDG.E.CI load0F2, [track0F + 4x<2>];\n", + j0c62 => "--:-:2:-:1 \@P1 LDG.E.CI load0F3, [track0F + 4x<3>];\n", + + j2c12 => "04:3:-:-:1 \@P0 STS.128 [writeFs + 4x<4*32>], load4F;\n", + j2c14 => "--:-:-:-:1 \@P0 IADD track4F0.CC, track4F0, param_CRST8;\n", + j2c19 => "--:-:-:-:1 \@P0 IADD.X track4F1, track4F1, RZ;\n", + + j2c56 => "04:-:-:-:1 \@P1 LDG.E.CI load4F0, [track4F + 4x<0>];\n", + j2c58 => "--:-:-:-:1 \@P1 LDG.E.CI load4F1, [track4F + 4x<1>];\n", + j2c60 => "--:-:-:-:1 \@P1 LDG.E.CI load4F2, [track4F + 4x<2>];\n", + j2c62 => "--:-:3:-:1 \@P1 LDG.E.CI load4F3, [track4F + 4x<3>];\n", + + j4c12 => "08:-:-:-:1 \@P0 STS.128 
[writeEs + 4x<0*64 + 0>], load0E0;\n", + j4c14 => "--:4:-:-:1 \@P0 STS.128 [writeEs + 4x<0*64 + 32>], load0E4;\n", + j4c16 => "--:-:-:-:1 \@P0 IADD track0E0.CC, track0E0, param_MPQN8;\n", + j4c21 => "--:-:-:-:1 \@P0 IADD.X track0E1, track0E1, RZ;\n", + + j4c60 => "08:-:-:-:1 \@P2 LDG.E.128 load0E0, [track0E + 4x< 0>];\n", + j4c62 => "--:-:4:-:1 \@P3 LDG.E.128 load0E4, [track0E + 4x<32>];\n", + + j6c12 => "10:-:-:-:1 \@P0 STS.128 [writeEs + 4x<4*64 + 0>], load4E0;\n", + j6c14 => "--:5:-:-:1 \@P0 STS.128 [writeEs + 4x<4*64 + 32>], load4E4;\n", + j6c16 => "--:-:-:-:1 \@P0 IADD track4E0.CC, track4E0, param_MPQN8;\n", + j6c21 => "--:-:-:-:1 \@P0 IADD.X track4E1, track4E1, RZ;\n", + + j6c60 => "10:-:-:-:1 \@P2 LDG.E.128 load4E0, [track4E + 4x< 0>];\n", + j6c62 => "--:-:5:-:1 \@P3 LDG.E.128 load4E4, [track4E + 4x<32>];\n", + + j6c63 => "--:-:-:-:1 \@P0 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c8 => "--:-:-:-:1 ISETP.GT.AND P1, PT, K, RZ, P1;\n", + j7c10 => "--:-:-:-:1 ISETP.GT.AND P2, PT, K, RZ, P2;\n", # chain P2 like P1 so the n < N bound survives every pass + j7c12 => "--:-:-:-:1 ISETP.GT.AND P3, PT, K, RZ, P3;\n", # chain P3 so the n32 < N bound survives every pass + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_8K;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ?
'@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +--:-:-:-:0 MOV warp_cnt, 32; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkF, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +01:-:-:-:6 MOV rst, tid; + +LUT_LOOP: + + +// warp synchronous loop while warp_cnt < RST (c=0) +--:-:-:-:1 ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT; +--:-:-:-:1 IADD warp_cnt, warp_cnt, 32; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = qs + s +// y = pr + r +// z = mt + t +--:-:-:-:1 IADD z, mt, t; +--:-:-:-:1 IADD y, pr, r; +--:-:-:-:1 IADD x, qs, s; +// i = (z*HWN + y*WN + x*N) * 4 +20:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD sliceI, x, param_N, sliceI; +--:-:-:-:1 SHL sliceI, sliceI, 2; +// Bounds check x 
and y, and make i negative if outside +--:-:-:-:1 ISET.LT.AND x0, x, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW, x, param_W, PT; +--:-:-:-:1 ISET.LT.AND y0, y, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH, y, param_H, PT; +--:-:-:-:1 ISET.LT.AND z0, z, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD, z, param_D, PT; +--:-:-:-:1 LOP3.LUT sliceI, sliceI, x0, xW, 0xfe; + +--:-:-:-:1 LOP3.LUT sliceI, sliceI, y0, yH, 0xfe; +--:-:-:-:1 SHL lutStore, rst, 2; +--:-:-:-:1 IADD rst, rst, 32; + +--:-:-:-:1 LOP3.LUT sliceI, sliceI, z0, zD, 0xfe; +// Store i imgOffset into the shared lookup table +--:6:-:-:1 STS [lutStore + addr_lut], sliceI; + + +--:-:-:-:5 @P0 BRA.U LUT_LOOP; + + + +--:-:-:-:1 MOV RST, param_RST; +--:-:-:-:1 MOV DHWN1, param_DHWN; +--:-:-:-:1 SHL DHWN1, DHWN1, 2; + +--:-:-:-:1 LOP.AND readEs, readEs, 0x7f; +--:-:-:-:1 LOP.AND readFs, readFs, 0x3f; + +// writeCs = ((readIs / 4) * 64 + readEs); +--:-:-:-:1 ISCADD writeCs, readFs, readEs, 4; + +// readCs = (tid & 31) << 2; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL readCs, tid31, 2; + +// nn = blkE*64 + tid31; +04:-:-:-:1 ISCADD nn, blkE, tid31, 6; + +// crst = blkF*32 +02:-:-:-:1 SHL crst00, blkF, 5; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + +--:-:-:-:1 LEA trackI0.CC, nn, param_I[0], 2; +--:-:-:-:1 LEA.HI.X trackI1, nn, param_I[1], RZ, 2; + +// n < N +--:-:-:-:1 ISETP.LT.AND P5, PT, nn, param_N, PT; +--:-:-:-:1 IADD nn, nn, 32; +--:-:-:-:1 ISETP.LT.AND P6, PT, nn, param_N, PT; + +--:-:-:-:1 MOV alpha, param_alpha; + + + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:1 IADD crst00, crst00, 12;\n" . + "--:-:-:-:1 IADD crst04, crst04, 12;\n" . + "--:-:-:-:1 IADD crst08, crst08, 12;\n" . + "--:-:-:-:1 IADD crst12, crst12, 12;\n" if $y == 4; + + $out .= sprintf( + "01:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "02:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . 
+ "04:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "08:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], c0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], c4; + +--:-:-:-:1 LDS c0, [readCs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS c1, [readCs + 4x<0*64 + 32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<1*64 + 00>]; +--:-:-:-:1 LDS c3, [readCs + 4x<1*64 + 32>]; +--:-:-:-:1 LDS c4, [readCs + 4x<2*64 + 00>]; +--:-:-:-:1 LDS c5, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 LDS c6, [readCs + 4x<3*64 + 00>]; +--:-:-:-:1 LDS c7, [readCs + 4x<3*64 + 32>]; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; + +--:-:-:-:1 XMAD.LO2C c00, crst00, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c04, crst04, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c08, crst08, param_magic_RST, RZ; +--:-:-:-:1 XMAD.LO2C c12, crst12, param_magic_RST, RZ; + +--:-:-:-:1 SHR.U32 c00, c00, param_shift_RST; +--:-:-:-:1 SHR.U32 c04, c04, param_shift_RST; +--:-:-:-:1 SHR.U32 c08, c08, param_shift_RST; +--:-:-:-:1 SHR.U32 c12, c12, param_shift_RST; + +--:-:-:-:1 VMAD.U16.U16 lut00, -c00, RST, crst00; +--:-:-:-:1 VMAD.U16.U16 lut04, -c04, RST, crst04; +--:-:-:-:1 VMAD.U16.U16 lut08, -c08, RST, crst08; +--:-:-:-:1 VMAD.U16.U16 lut12, -c12, RST, crst12; + +--:-:-:-:1 SHL lut00, lut00, 2; +--:-:-:-:1 SHL lut04, lut04, 2; +--:-:-:-:1 SHL lut08, lut08, 2; +--:-:-:-:1 SHL lut12, lut12, 2; + +--:-:-:-:1 XMAD.LO2 chan00, DHWN1, c00, RZ; +--:-:-:-:1 XMAD.LO2 chan04, DHWN1, c04, RZ; +--:-:-:-:1 XMAD.LO2 chan08, DHWN1, c08, RZ; +--:-:-:-:1 XMAD.LO2 chan12, DHWN1, c12, RZ; + +--:-:-:-:1 IADD crst00, 
crst00, 1; +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 IADD crst12, crst12, 1; + +--:-:1:-:1 @P0 LDS img00, [lut00 + addr_lut]; +--:-:2:-:1 @P1 LDS img04, [lut04 + addr_lut]; +--:-:3:-:1 @P2 LDS img08, [lut08 + addr_lut]; +--:-:4:-:1 @P3 LDS img12, [lut12 + addr_lut]; + + + +01:-:-:-:1 IADD3 track00I0.CC, trackI0, img00, chan00; +--:-:-:-:5 ISETP.GE.AND P0, PT, img00, RZ, P0; +--:-:-:-:1 IADD.X track00I1, trackI1, RZ; + +02:-:-:-:1 IADD3 track04I0.CC, trackI0, img04, chan04; +--:-:-:-:5 ISETP.GE.AND P1, PT, img04, RZ, P1; +--:-:-:-:1 IADD.X track04I1, trackI1, RZ; + +04:-:-:-:1 IADD3 track08I0.CC, trackI0, img08, chan08; +--:-:-:-:5 ISETP.GE.AND P2, PT, img08, RZ, P2; +--:-:-:-:1 IADD.X track08I1, trackI1, RZ; + +08:-:-:-:1 IADD3 track12I0.CC, trackI0, img12, chan12; +--:-:-:-:5 ISETP.GE.AND P3, PT, img12, RZ, P3; +--:-:-:-:0 IADD.X track12I1, trackI1, RZ; + +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00I], c0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04I], c2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08I], c4; +--:-:-:-:3 PSETP.AND.AND P2, PT, P2, P6, PT; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12I], c6; +--:-:-:-:5 PSETP.AND.AND P3, PT, P3, P6, PT; + +--:1:-:-:2 @P0 RED.E.ADD.F32.FTZ.RN [track00I + 4x<32>], c1; +--:2:-:-:2 @P1 RED.E.ADD.F32.FTZ.RN [track04I + 4x<32>], c3; +--:3:-:-:4 @P2 RED.E.ADD.F32.FTZ.RN [track08I + 4x<32>], c5; +--:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12I + 4x<32>], c7; + +--:-:-:-:5 RET; + diff --git a/Kernel/Convolution/Pascal/sconv_updat_C128_K128.sass b/Kernel/Convolution/Pascal/sconv_updat_C128_K128.sass new file mode 100644 index 0000000..dfb6bea --- /dev/null +++ b/Kernel/Convolution/Pascal/sconv_updat_C128_K128.sass @@ -0,0 +1,718 @@ +# Kernel: sconv_updat_C128_K128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(128*16 + 32)*4 + 0> + addr_m : 4x<(128*16 + 32)*4 + 4> + addr_q : 4x<(128*16 + 32)*4 + 5> + szBuf : (128*16 + 32) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_offset_K : c[0x0][0x15c] + param_N : c[0x0][0x160] + param_K : c[0x0][0x164] + param_D : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_WN : c[0x0][0x174] + param_HWN : c[0x0][0x178] + param_DHWN : c[0x0][0x17c] + param_C : c[0x0][0x180] + param_CRST : c[0x0][0x184] + param_RST : c[0x0][0x188] + param_magic_RST : c[0x0][0x18c] + param_shift_RST : c[0x0][0x190] + param_RS : c[0x0][0x194] + param_magic_RS : c[0x0][0x198] + param_shift_RS : c[0x0][0x19c] + param_S : c[0x0][0x1a0] + param_magic_S : c[0x0][0x1a4] + param_shift_S : c[0x0][0x1a8] + param_pad_d : c[0x0][0x1ac] + param_pad_h : c[0x0][0x1b0] + param_pad_w : c[0x0][0x1b4] + param_str_d : c[0x0][0x1b8] + param_str_h : c[0x0][0x1bc] + param_str_w : c[0x0][0x1c0] + param_dil_d : c[0x0][0x1c4] + param_dil_h : c[0x0][0x1c8] + param_dil_w : c[0x0][0x1cc] + param_P : c[0x0][0x1d0] + param_Q : c[0x0][0x1d4] + param_PQ : c[0x0][0x1d8] + param_QN : c[0x0][0x1dc] + param_PQN : c[0x0][0x1e0] + param_MPQN : c[0x0][0x1e4] + param_magic_Q : c[0x0][0x1e8] + param_shift_Q : c[0x0][0x1ec] + param_magic_PQ : 
c[0x0][0x1f0] + param_shift_PQ : c[0x0][0x1f4] + param_grid_P : c[0x0][0x1f8] + param_grid_Q : c[0x0][0x1fc] + param_grid_PQ : c[0x0][0x200] + param_CRSTK : c[0x0][0x204] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkI, blkE, one + 68-111 ~ tidX, tidY, tid1, tid7, tid128, shiftX, blkMPQ, m, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-95 ~ tidYY, mm, mt, pr, y, z, y0, yH, z0, zD, bounds_yz, c, r, t, rs, rst + 64-95 ~ qs, x, x0, xW, bounds_x, ti, te, Q + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 96-111 : loadI<0-7>, loadE<0-7> + 112-115 : trackI<0-1>, trackE<0-1> + + 116-124 ~ writeS, loopN, e, i, p, q, k, crst, s + 125-127 ~ swapBuf, readIs, readEs + + 68-83 : c<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 84-124 ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128, blk_MPQ, CRSTK, xmad_determ + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID1; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkI, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; +--:-:-:-:5 BRA.U END_CTAID1; +CTAID1: +--:-:2:-:1 S2R blkMPQ, SR_CTAID.Z; +--:-:3:-:1 S2R blkI, SR_CTAID.X; +--:-:4:-:1 S2R blkE, SR_CTAID.Y; +END_CTAID1: + + +// tidX = tid >> 1 +// tidY = (tid & 1) << 2 +// shiftX = (tid & 1) << 4 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tid1, 2; +--:-:-:-:1 SHL shiftX, tid1, 4; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +--:-:-:-:1 PSETP.AND.AND P0, PT, PT, 
PT, PT; + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_grid_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_grid_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; + +// We need to be able to restore m and q at each P iteration +// Register spill to shared +--:1:-:-:1 STS [addr_m], m; +--:-:-:-:1 STS [addr_q], q; + +// writeBs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 IADD writeS, writeS, shiftX; +--:-:-:-:1 ISCADD writeS, writeS, 4x, 2; + +// readIs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readIs, tid, 0x70; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readEs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + szBuf; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readEs, tid128, 4; +--:-:-:-:1 LOP.OR readEs, readEs, tid7; +--:-:-:-:1 ISCADD readEs, readEs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + +// crst = 
blockI*128 + tidX +04:-:-:-:1 ISCADD crst, blkI, tidX, 7; + +// k = blockE*128 + tidX + offset_K +08:-:-:-:1 ISCADD k, blkE, tidX, 7; +--:-:-:-:1 IADD k, k, param_offset_K; + +--:-:-:-:1 MOV loopN, param_N; + + + +NEXT_P: + +01:-:4:-:1 S2R tidYY, SR_TID.X; +--:-:5:-:1 LDS mm, [addr_m]; + + +--:-:6:-:1 LDS q, [addr_q]; + +// c = crst / RST +// rst = crst % RST +--:-:-:-:1 XMAD.LO2C c, crst, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c, c, param_shift_RST; +--:-:-:-:1 XMAD rst, c, param_RST, RZ; +--:-:-:-:1 IADD rst, -rst, crst; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// y = p * u - pad_h + (r * dil_h) +// z = m * w - pad_d + (t * dil_d) +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +10:-:-:-:1 XMAD mt, mm, param_str_d, RZ; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 IADD y, y, -param_pad_h; +--:-:-:-:1 IADD z, z, -param_pad_d; +// e = k*MPQN + m*PQN + p*QN + tidYY +08:-:-:-:1 LOP.AND tidYY, tidYY, 1; +--:-:-:-:1 SHL tidYY, tidYY, 2; +--:-:-:-:1 XMAD.LO2C e, p, param_QN, tidYY; +--:-:-:-:1 XMAD.LO2C e, mm, param_PQN, e; +--:-:-:-:1 XMAD.LO2C e, k, param_MPQN, e; +// i = c*DHWN + z*HWN + y*WN + tidYY +--:-:-:-:1 XMAD.LO2C i, y, param_WN, tidYY; +--:-:-:-:1 XMAD.LO2C i, z, param_HWN, i; +--:-:-:-:1 XMAD.LO2C i, c, param_DHWN, i; +// bounds_yz = y < 0 || y >= H || z < 0 || z >= D ?
-1 : 0 +--:-:-:-:1 ISET.LT.AND y0, y, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH, y, param_H, PT; +--:-:-:-:1 ISET.LT.AND z0, z, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD, z, param_D, PT; +--:-:-:-:1 LOP.OR bounds_yz, y0, yH; +--:-:-:-:1 LOP3.LUT bounds_yz, bounds_yz, z0, zD, 0xfe; +// doLoadCRST = crst < CRST && bounds_yz == 0 +--:-:-:-:1 ISETP.LT.AND P4, PT, crst, param_CRST, PT; +--:-:-:-:1 ISETP.EQ.AND P4, PT, bounds_yz, RZ, P4; +// p += grid_P +--:-:-:-:1 IADD p, p, param_grid_P; + +--:-:-:-:1 ISETP.LT.AND P6, PT, p, param_P, PT; + + +NEXT_Q: + + +// Zigzag q but only if grid_P < P +--:-:-:-:1 LOP.AND.NZ P1, RZ, p, 1; +--:-:-:-:1 MOV Q, param_grid_P; +--:-:-:-:1 ISETP.LT.AND P1, PT, Q, param_P, P1; +--:-:-:-:1 MOV Q, -1; +20:-:-:-:1 @P1 IADD3 Q, -q, param_Q, Q; +--:-:-:-:1 @!P1 MOV Q, q; +// k < K +--:-:-:-:1 ISETP.LT.AND P3, PT, k, param_K, PT; +// qs = q * v - pad_w +// x = qs + (s * dil_w) +--:-:-:-:1 XMAD qs, Q, param_str_w, RZ; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 IADD x, x, -param_pad_w; +// bounds_x = x < 0 || x >= W ?
-1 : 0 +--:-:-:-:1 ISET.LT.AND x0, x, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW, x, param_W, PT; +--:-:-:-:1 LOP.OR bounds_x, x0, xW; +// doLoad = crst < CRST && bounds_yz == 0 && bounds_x == 0 +--:-:-:-:1 ISETP.EQ.AND P2, PT, bounds_x, RZ, P4; +// trackI = I + i + x*N +--:-:-:-:1 XMAD ti, x, param_N, i; +--:-:-:-:1 LEA trackI0.CC, ti, param_I[0], 2; +--:-:-:-:1 LEA.HI.X trackI1, ti, param_I[1], RZ, 2; +// trackE = E + e + q*N +--:-:-:-:1 XMAD te, Q, param_N, e; +--:-:-:-:1 LEA trackE0.CC, te, param_E[0], 2; +--:-:-:-:0 LEA.HI.X trackE1, te, param_E[1], RZ, 2; +// q += grid_Q +--:-:-:-:1 IADD q, q, param_grid_Q; +--:-:-:-:1 ISETP.LT.AND P5, PT, q, param_Q, PT; + +--:-:-:-:1 @!P0 IADD loopN, loopN, param_N; + + + +--:-:-:Y:6 @!P0 BRA.U NEXT_PQ; + +--:-:-:-:0 PSETP.AND.AND P0, PT, PT, PT, !PT; + +--:-:1:-:1 @P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>]; +--:-:2:-:1 @P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>]; +--:-:-:-:1 @!P2 LDS.U.128 loadI0, [addr_zero]; +--:-:5:-:1 @!P2 LDS.U.128 loadI4, [addr_zero]; + +--:-:-:-:0 ISETP.LE.AND P1, PT, loopN, 32, PT; + +--:-:3:-:1 @P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>]; +--:-:4:-:1 @P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>]; +--:-:-:-:1 @!P3 LDS.U.128 loadE0, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 loadE4, [addr_zero]; + +11:-:-:-:1 STS [writeS + 4x< 0*128>], loadI0; +--:-:-:-:1 STS [writeS + 4x< 1*128>], loadI1; +--:-:-:-:1 STS [writeS + 4x< 2*128>], loadI2; +--:-:-:-:1 STS [writeS + 4x< 3*128>], loadI3; + +02:-:-:-:1 STS [writeS + 4x< 8*128 + 16>], loadI4; +--:-:-:-:1 STS [writeS + 4x< 9*128 + 16>], loadI5; +--:-:-:-:1 STS [writeS + 4x<10*128 + 16>], loadI6; +--:-:-:-:1 STS [writeS + 4x<11*128 + 16>], loadI7; + +--:-:-:-:1 IADD trackI0.CC, trackI0, 4x<16>; +--:-:-:-:0 PSETP.AND.AND P5, PT, P1, P5, PT; + +24:-:-:-:1 STS [writeS + 4x< 0*128 + szBuf>], loadE0; +--:-:-:-:1 STS [writeS + 4x< 1*128 + szBuf>], loadE1; +--:-:-:-:1 STS [writeS + 4x< 2*128 + szBuf>], loadE2; +--:-:-:-:1 STS [writeS + 4x< 3*128 + szBuf>], loadE3; + +--:-:-:-:0 
PSETP.AND.AND P6, PT, P1, P6, PT; + +08:-:-:-:1 STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4; +--:-:-:-:1 STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5; +--:-:-:-:1 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6; +--:1:-:-:1 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7; + +--:-:-:-:1 IADD.X trackI1, trackI1, RZ; + +--:-:-:-:1 IADD trackE0.CC, trackE0, 4x<16>; + +--:-:-:-:1 IADD readEs, readEs, -swapBuf; +--:-:-:-:0 IADD readIs, readIs, -swapBuf; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackE1, trackE1, RZ; + +--:-:2:-:1 @P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>]; +--:5:2:-:1 @P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>]; +--:-:3:-:1 @P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>]; +--:6:3:-:1 @P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>]; + +10:-:-:-:6 @P2 IADD trackI0.CC, trackI0, 4x<16>; +--:-:-:-:1 @P2 IADD.X trackI1, trackI1, RZ; +20:-:-:-:6 @P3 IADD trackE0.CC, trackE0, 4x<16>; +--:-:-:-:0 @P3 IADD.X trackE1, trackE1, RZ; + +--:-:-:Y:5 @P5 BRA.U NEXT_Q; +--:-:-:Y:5 @P6 BRA.U NEXT_P; + +--:-:-:-:2 ISETP.LT.AND P5, PT, q, param_Q, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, p, param_P, PT; + +NEXT_PQ: + +--:-:1:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*128 + 64>]; +--:-:1:-:2 LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + + +// P0 loop N +// P2 bounds I +// P3 bounds E +// P4 bounds yz +// P5 loop Q +// P6 loop P + +//loop = N >= 16 && (N >= 32 || (!p5 && !p6)) + +NEXT_16N: + + + + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD loopN, loopN, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, loopN, 16, PT;\n", + + j4c8 => "02:-:-:-:1 \@P0 STS [writeS + 4x< 0*128>], loadI0;\n", + j4c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 1*128>], loadI1;\n", + j4c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 2*128>], loadI2;\n", + j4c14 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 3*128>], loadI3;\n", + + j5c8 => 
"--:-:-:-:1 \@P0 STS [writeS + 4x< 8*128 + 16>], loadI4;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 9*128 + 16>], loadI5;\n", + j5c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x<10*128 + 16>], loadI6;\n", + j5c14 => "--:2:-:-:1 \@P0 STS [writeS + 4x<11*128 + 16>], loadI7;\n", + + j5c16 => "--:-:-:-:1 ISETP.GE.AND P2, PT, loopN, 32, P2;\n", + + j5c60 => "02:-:2:-:1 \@P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>];\n", + j5c62 => "--:4:2:-:1 \@P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>];\n", + + j6c16 => "--:-:-:-:1 \@!P2 LDS.U.128 loadI0, [addr_zero];\n", + j7c16 => "--:-:-:-:1 \@!P2 LDS.U.128 loadI4, [addr_zero];\n", + + j10c57 => "08:-:-:-:1 \@P2 IADD trackI0.CC, trackI0, 4x<16>;\n", + j10c62 => "--:-:-:-:1 \@P2 IADD.X trackI1, trackI1, RZ;\n", + + j12c8 => "04:-:-:-:1 \@P0 STS [writeS + 4x< 0*128 + szBuf>], loadE0;\n", + j12c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 1*128 + szBuf>], loadE1;\n", + j12c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 2*128 + szBuf>], loadE2;\n", + j12c14 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 3*128 + szBuf>], loadE3;\n", + + j13c8 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4;\n", + j13c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5;\n", + j13c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;\n", + j13c14 => "--:3:-:-:1 \@P0 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;\n", + + j13c16 => "--:-:-:-:1 ISETP.GE.AND P3, PT, loopN, 32, P3;\n", + + j13c60 => "04:-:3:-:1 \@P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>];\n", + j13c62 => "--:4:3:-:1 \@P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>];\n", + + j14c16 => "--:-:-:-:1 \@!P3 LDS.U.128 loadE0, [addr_zero];\n", + j15c16 => "--:-:-:-:1 \@!P3 LDS.U.128 loadE4, [addr_zero];\n", + + j15c57 => "08:-:-:-:1 \@P3 IADD trackE0.CC, trackE0, 4x<16>;\n", + j15c62 => "--:-:-:-:1 \@P3 IADD.X trackE1, trackE1, RZ;\n", + + j14c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "20:-:-:-:1 \@P0 IADD readEs, readEs, -swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j15c24 => "--:-:-:-:1 ISETP.GT.AND P1, PT, loopN, 32, PT;\n", + j15c37 => "--:-:-:-:1 PSETP.AND.OR P1, PT, !P5, !P6, P1;\n", + j15c50 => "--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, PT;\n", + + j15c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_16N;\n" . + "01:-:-:Y:5 \@P5 BRA.U NEXT_Q;\n" . + "--:-:-:Y:5 \@P6 BRA.U NEXT_P;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? '@P0' : ' '; + my $shift = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2; + my $barrier = $j == 14 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID2; +--:-:2:-:1 S2R blkI, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.X; +--:-:-:-:5 BRA.U END_CTAID2; +CTAID2: +--:-:2:-:1 S2R blkI, SR_CTAID.X; +--:-:3:-:1 S2R blkE, SR_CTAID.Y; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.Z; +END_CTAID2: + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readEs, readEs, -4x; +--:-:-:-:1 @P0 IADD readIs, readIs, -swapBuf; +--:-:-:-:1 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 128 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 5; + +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +01:-:-:-:1 LOP.AND t128, tid, 128; + +// kk = tid31 | (t128 >> 2); +--:-:-:-:1 SHR.U32 kk, t128, 2; +--:-:-:-:1 LOP.OR kk, tid31, kk; + +// readCs = ((tid96 << 4) | kk) << 2; +--:-:-:-:1 SHL readCs, tid96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, kk; +--:-:-:-:1 SHL readCs, readCs, 2; + +// kk += blkE*128; +04:-:-:-:1 ISCADD kk, blkE, kk, 7; +--:-:-:-:1 IADD kk, kk, param_offset_K; + +// crst = blkI*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 crst00, tid96, 1; +02:-:-:-:1 ISCADD crst00, blkI, crst00, 7; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; +--:-:-:-:1 SHL K4, K, 4; +--:-:-:-:1 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +--:-:-:-:1 VMAD.U16.U16 tf, crst00, K, kk; +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSTK, param_CRSTK; +08:-:-:-:1 XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ; + }; + } + return ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 0x2; +--:-:-:-:1 LEA.HI.X 
track00F1, tf, param_F[1], RZ, 0x2; + +// kk < K +--:-:-:-:1 ISETP.LT.AND P5, PT, kk, param_K, PT; +--:-:-:-:1 IADD kk, kk, 64; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, PT; + +--:-:-:-:1 MOV alpha, param_alpha; + + + +--:-:-:-:6 IADD track04F0.CC, track00F0, K4; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:6 IADD track08F0.CC, track04F0, K4; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:6 IADD track12F0.CC, track08F0, K4; +--:-:-:-:0 IADD.X track12F1, track08F1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD track00F0.CC, track00F0, K60;\n" . + "--:-:-:-:1 IADD crst00, crst00, 60;\n" . + "--:-:-:-:1 IADD.X track00F1, track00F1, RZ;\n" . + "--:-:-:-:5 IADD track04F0.CC, track04F0, K60;\n" . + "--:-:-:-:1 IADD crst04, crst04, 60;\n" . + "--:-:-:-:1 IADD.X track04F1, track04F1, RZ;\n" . + "--:-:-:-:5 IADD track08F0.CC, track08F0, K60;\n" . + "--:-:-:-:1 IADD crst08, crst08, 60;\n" . + "--:-:-:-:1 IADD.X track08F1, track08F1, RZ;\n" . + "--:-:-:-:5 IADD track12F0.CC, track12F0, K60;\n" . + "--:-:-:-:1 IADD crst12, crst12, 60;\n" . + "--:-:-:-:1 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +--:-:-:-:0 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], c0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], c4; + +--:-:1:-:1 LDS c0, [readCs + 4x<0*128 + 00>]; +--:-:2:-:1 LDS c2, [readCs + 4x<1*128 + 00>]; +--:-:3:-:1 LDS c4, [readCs + 4x<2*128 + 00>]; +--:-:4:-:a LDS c6, [readCs + 4x<3*128 + 00>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:-:-:-:1 @P0 STG.E.CG [track00F], c0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 STG.E.CG [track04F], c2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 STG.E.CG [track08F], c4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 STG.E.CG [track12F], c6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } + else + { + return q{ +01:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F], c0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F], c2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F], c4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F], c6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } ++] + +--:-:1:-:1 LDS c1, [readCs + 4x<0*128 + 64>]; +--:-:2:-:1 LDS c3, [readCs + 4x<1*128 + 64>]; +--:-:3:-:1 LDS c5, [readCs + 4x<2*128 + 64>]; +--:-:4:-:a LDS c7, [readCs + 4x<3*128 + 64>]; + 
+[+ + our $determ; + if ($determ) + { + return q{ +01:1:-:-:1 @P0 STG.E.CG [track00F + 4x<64>], c1; +02:2:-:-:1 @P1 STG.E.CG [track04F + 4x<64>], c3; +04:3:-:-:1 @P2 STG.E.CG [track08F + 4x<64>], c5; +08:4:-:-:1 @P3 STG.E.CG [track12F + 4x<64>], c7; + }; + } + else + { + return q{ +01:1:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], c1; +02:2:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], c3; +04:3:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], c5; +08:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], c7; + }; + } ++] + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Pascal/sconv_updat_C128_K64.sass b/Kernel/Convolution/Pascal/sconv_updat_C128_K64.sass new file mode 100644 index 0000000..26cc64c --- /dev/null +++ b/Kernel/Convolution/Pascal/sconv_updat_C128_K64.sass @@ -0,0 +1,818 @@ +# Kernel: sconv_updat_C128_K64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2> + szShareI : (128*16 + 32) + szShareE : (64*16 + 32) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_offset_K : c[0x0][0x15c] + param_N : c[0x0][0x160] + param_K : c[0x0][0x164] + param_D : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_WN : c[0x0][0x174] + param_HWN : c[0x0][0x178] + param_DHWN : c[0x0][0x17c] + param_C : c[0x0][0x180] + param_CRST : c[0x0][0x184] + param_RST : c[0x0][0x188] + param_magic_RST : c[0x0][0x18c] + param_shift_RST : c[0x0][0x190] + param_RS : c[0x0][0x194] + param_magic_RS : c[0x0][0x198] + param_shift_RS : c[0x0][0x19c] + param_S : c[0x0][0x1a0] + param_magic_S : c[0x0][0x1a4] + param_shift_S : c[0x0][0x1a8] + param_pad_d : c[0x0][0x1ac] + param_pad_h : c[0x0][0x1b0] + param_pad_w : c[0x0][0x1b4] + param_str_d : c[0x0][0x1b8] + param_str_h : c[0x0][0x1bc] + param_str_w : c[0x0][0x1c0] + param_dil_d : c[0x0][0x1c4] + param_dil_h : c[0x0][0x1c8] + param_dil_w : c[0x0][0x1cc] + param_P : c[0x0][0x1d0] + param_Q : c[0x0][0x1d4] + param_PQ : c[0x0][0x1d8] + param_QN : c[0x0][0x1dc] + param_PQN : c[0x0][0x1e0] + param_MPQN : c[0x0][0x1e4] + param_magic_Q : c[0x0][0x1e8] + param_shift_Q : c[0x0][0x1ec] + param_magic_PQ : c[0x0][0x1f0] + param_shift_PQ : c[0x0][0x1f4] + param_grid_P : c[0x0][0x1f8] + param_grid_Q : c[0x0][0x1fc] + param_grid_PQ : c[0x0][0x200] + param_CRSTK : c[0x0][0x204] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ tid, blkI, blkE, one + 68-99 ~ blkMPQ, tidX, tid1, shiftX, magicPQ, 
magicQ, negQ, negPQ, pq, div1, div2, div3 + + 64-72 ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q + 73-99 ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1> + 73-99 ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 100-147 : load0I<00-15>, load1I<00-15>, loadE<00-15> + 148-153 : track0I<0-1>, track1I<0-1>, trackE<0-1> + + 154-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY + 165-167 ~ readIs, readEs, swapBuf + + 68-83 : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ + + + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID1; +--:-:2:-:1 S2R blkMPQ, SR_CTAID.X; +--:-:3:-:1 S2R blkI, SR_CTAID.Y; +--:-:4:-:1 S2R blkE, SR_CTAID.Z; +--:-:-:-:5 BRA.U END_CTAID1; +CTAID1: +--:-:2:-:1 S2R blkMPQ, SR_CTAID.Z; +--:-:3:-:1 S2R blkI, SR_CTAID.X; +--:-:4:-:1 S2R blkE, SR_CTAID.Y; +END_CTAID1: + + +// tidX = tid >> 1 +// tidY = (tid & 1) << 2 +// shiftX = (tid & 1) << 4 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHR.U32 tidX, tid, 1; +--:-:-:-:1 SHL tidY, tid1, 2; +--:-:-:-:1 SHL shiftX, tid1, 4; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +--:-:-:-:1 MOV magicPQ, param_magic_PQ; +--:-:-:-:1 MOV magicQ, param_magic_Q; +--:-:-:-:1 IADD negQ, RZ, -param_grid_Q; +--:-:-:-:1 IADD negPQ, RZ, -param_grid_PQ; + +--:-:-:-:1 ISETP.NE.AND P1, PT, magicPQ, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magicQ, 1, PT; + +// m = blkMPQ / PQ +02:-:-:-:1 @P1 XMAD div1, blkMPQ, magicPQ, RZ; +--:-:-:-:1 @P1 XMAD div2, blkMPQ, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blkMPQ.H1, magicPQ.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ, div1; +--:-:-:-:1 @P1 
IADD3.RS m, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 m, m, param_shift_PQ; +--:-:-:-:1 @!P1 SHR.U32 m, blkMPQ, param_shift_PQ; + +// pq = blkMPQ % PQ +--:-:-:-:1 XMAD.LO2 pq, negPQ, m, blkMPQ; + +// p = blockPQ / Q +--:-:-:-:1 @P2 XMAD div1, pq, magicQ, RZ; +--:-:-:-:1 @P2 XMAD div2, pq, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, pq.H1, magicQ.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, pq.H1, magicQ, div1; +--:-:-:-:1 @P2 IADD3.RS p, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 p, p, param_shift_Q; +--:-:-:-:1 @!P2 SHR.U32 p, pq, param_shift_Q; + +// q = blockPQ % Q +--:-:-:-:1 XMAD.S16.S16 q, negQ, p, pq; +--:-:-:-:1 MOV qq, q; + +// writeIs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 7; +--:-:-:-:1 IADD writeIs, writeIs, shiftX; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +// writeEs = (tidY*64 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeEs, tidY, tidX, 6; +--:-:-:-:1 IADD writeEs, writeEs, shiftX; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x, 2; + +// readIs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readIs, tid, -16; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; +--:-:-:-:1 SHL readIs, readIs, 4; +// readEs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +--:-:-:-:1 BFE.U32 readEs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readEs, readEs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + +// crst = blockI*128 + tid +04:-:-:-:1 ISCADD crst0, blkI, tidX, 7; +--:-:-:-:1 IADD crst1, crst0, 64; + +// k = blockE*64 + tid +08:-:-:-:1 ISCADD k, blkE, tidX, 6; +--:-:-:-:1 IADD k, k, param_offset_K; + +--:-:-:-:1 MOV loopN, RZ; + +--:-:-:-:1 PSETP.AND.AND P0, PT, PT, PT, PT; + + +NEXT_PQ: + + +// Zigzag q but only if grid_P < P +--:-:-:-:1 LOP.AND.NZ P1, RZ, p, 1; +--:-:-:-:1 MOV Q, param_grid_P; +--:-:-:-:1 ISETP.LT.AND P1, PT, Q, param_P, P1; +--:-:-:-:1 MOV Q, -1; +--:-:-:-:1 @P1 IADD3 Q, -q, param_Q, Q; +--:-:-:-:1 @!P1 MOV Q, q; +// c = crst / RST +// rst = crst % RST +--:-:-:-:1 XMAD.LO2C c0, crst0, 
param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c0, c0, param_shift_RST; +--:-:-:-:1 XMAD rst0, c0, param_RST, RZ; +--:-:-:-:1 IADD rst0, -rst0, crst0; +--:-:-:-:1 XMAD.LO2C c1, crst1, param_magic_RST, RZ; +--:-:-:-:1 SHR.U32 c1, c1, param_shift_RST; +--:-:-:-:1 XMAD rst1, c1, param_RST, RZ; +--:-:-:-:1 IADD rst1, -rst1, crst1; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t0, rst0, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t0, t0, param_shift_RS; +--:-:-:-:1 XMAD rs0, t0, param_RS, RZ; +--:-:-:-:1 IADD rs0, -rs0, rst0; +--:-:-:-:1 XMAD.LO2C t1, rst1, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t1, t1, param_shift_RS; +--:-:-:-:1 XMAD rs1, t1, param_RS, RZ; +--:-:-:-:1 IADD rs1, -rs1, rst1; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r0, rs0, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r0, r0, param_shift_S; +--:-:-:-:1 XMAD s0, r0, param_S, RZ; +--:-:-:-:1 IADD s0, -s0, rs0; +--:-:-:-:1 XMAD.LO2C r1, rs1, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r1, r1, param_shift_S; +--:-:-:-:1 XMAD s1, r1, param_S, RZ; +--:-:-:-:1 IADD s1, -s1, rs1; +// z = m * w - pad_d + (t * dil_d) +// y = p * u - pad_h + (r * dil_h) +// x = q * v - pad_w + (s * dil_w) +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, Q, param_str_w, RZ; +--:-:-:-:1 XMAD z1, t1, param_dil_d, mt; +--:-:-:-:1 XMAD y1, r1, param_dil_h, pr; +--:-:-:-:1 XMAD x1, s1, param_dil_w, qs; +--:-:-:-:1 XMAD z0, t0, param_dil_d, mt; +--:-:-:-:1 XMAD y0, r0, param_dil_h, pr; // track 0 gets its own y (must not clobber y1) +--:-:-:-:1 XMAD x0, s0, param_dil_w, qs; // track 0 x: s0 scaled by dil_w, same as x1 +--:-:-:-:1 IADD z1, z1, -param_pad_d; +--:-:-:-:1 IADD y1, y1, -param_pad_h; +--:-:-:-:1 IADD x1, x1, -param_pad_w; +--:-:-:-:1 IADD z0, z0, -param_pad_d; +--:-:-:-:1 IADD y0, y0, -param_pad_h; +--:-:-:-:1 IADD x0, x0, -param_pad_w; + + +// Split blocks to fit inside of 36 registers + +// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY +--:-:-:-:1 XMAD.LO2C ti0, c0, param_DHWN, tidY; +--:-:-:-:1 XMAD.LO2C ti0, z0, param_HWN, ti0; +--:-:-:-:1 XMAD.LO2C ti0, y0, 
param_WN, ti0; +--:-:-:-:1 XMAD ti0, x0, param_N, ti0; +--:-:-:-:1 XMAD.LO2C ti1, c1, param_DHWN, tidY; +--:-:-:-:1 XMAD.LO2C ti1, z1, param_HWN, ti1; +--:-:-:-:1 XMAD.LO2C ti1, y1, param_WN, ti1; +--:-:-:-:1 XMAD ti1, x1, param_N, ti1; +--:-:-:-:1 LEA track0I0.CC, ti0, param_I[0], 2; +--:-:-:-:1 LEA.HI.X track0I1, ti0, param_I[1], RZ, 2; +--:-:-:-:1 LEA track1I0.CC, ti1, param_I[0], 2; +--:-:-:-:1 LEA.HI.X track1I1, ti1, param_I[1], RZ, 2; + +// trackE = k*MPQN + m*PQN + p*QN + tidY +--:-:-:-:1 XMAD.LO2C te, k, param_MPQN, tidY; +--:-:-:-:1 XMAD.LO2C te, m, param_PQN, te; +--:-:-:-:1 XMAD.LO2C te, p, param_QN, te; +--:-:-:-:1 XMAD te, Q, param_N, te; +--:-:-:-:1 LEA trackE0.CC, te, param_E[0], 2; +--:-:-:-:0 LEA.HI.X trackE1, te, param_E[1], RZ, 2; + +// Bounds check x,y,z,c for each I track. +// If out of bounds, this will set the track address to -1 +--:-:-:-:1 ISET.GE.AND cC0, c0, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd0, z0, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD0, z0, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh0, y0, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH0, y0, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw0, x0, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW0, x0, param_W, PT; +--:-:-:-:1 LOP.OR track0I0, track0I0, cC0; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe; +--:-:-:-:1 LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe; + +--:-:-:-:1 ISET.GE.AND cC1, c1, param_C, PT; +--:-:-:-:1 ISET.LT.AND zd1, z1, RZ, PT; +--:-:-:-:1 ISET.GE.AND zD1, z1, param_D, PT; +--:-:-:-:1 ISET.LT.AND yh1, y1, RZ, PT; +--:-:-:-:1 ISET.GE.AND yH1, y1, param_H, PT; +--:-:-:-:1 ISET.LT.AND xw1, x1, RZ, PT; +--:-:-:-:1 ISET.GE.AND xW1, x1, param_W, PT; +--:-:-:-:1 LOP.OR track1I0, track1I0, cC1; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe; +--:-:-:-:1 LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + 
+--:-:-:-:1 ISETP.NE.AND P2, PT, track0I0, -1, PT; +--:-:-:-:1 ISETP.NE.AND P3, PT, track1I0, -1, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, k, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:-:1 IADD loopN, loopN, param_N; + + +--:-:-:Y:5 @P0 BRA.U FIRST_LOAD; + +INIT_LOOP: + +--:-:1:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>]; +--:-:1:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>]; +--:-:1:-:2 LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>]; + +NEXT_16N: + + + + my %insert = + ( + j0c8 => "--:-:-:-:1 IADD loopN, loopN, -16;\n", + + # p0 = (N & 16) != 0 + # p1 = N >= 32 && p0 + j0c14 => "--:-:-:-:1 LOP.AND.NZ P0, RZ, loopN, 16;\n", + j0c28 => "--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 32, P0;\n", + + + j1c8 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 0*128 + 0 + 0>], load0I08;\n", + j1c10 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 1*128 + 0 + 0>], load0I09;\n", + j1c12 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 2*128 + 0 + 0>], load0I10;\n", + j1c14 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 3*128 + 0 + 0>], load0I11;\n", + j1c16 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 8*128 + 0 + 16>], load0I12;\n", + j1c18 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 9*128 + 0 + 16>], load0I13;\n", + j1c20 => "--:-:-:-:1 \@P0 STS [writeIs + 4x<10*128 + 0 + 16>], load0I14;\n", + j1c22 => "--:-:-:-:1 \@P0 STS [writeIs + 4x<11*128 + 0 + 16>], load0I15;\n", + + j2c8 => "02:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 + 0 + 0>], load0I00;\n", + j2c10 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 + 0 + 0>], load0I01;\n", + j2c12 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 + 0 + 0>], load0I02;\n", + j2c14 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 + 0 + 0>], load0I03;\n", + j2c16 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 + 0 + 16>], load0I04;\n", + j2c18 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 9*128 + 0 + 16>], load0I05;\n", + j2c20 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 + 0 + 
16>], load0I06;\n", + j2c22 => "--:2:-:-:1 \@!P0 STS [writeIs + 4x<11*128 + 0 + 16>], load0I07;\n", + + j2c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, track0I0, -1, P1;\n", + j2c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n", + + j3c8 => "02:-:-:-:1 \@P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>];\n", + j3c10 => "--:-:-:-:1 \@P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>];\n", + j3c12 => "--:-:-:-:1 \@P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>];\n", + j3c14 => "--:5:2:-:1 \@P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>];\n", + + j4c8 => "--:-:-:-:1 \@P3 LDS.U.128 load0I00, [addr_zero];\n", + j4c10 => "--:-:-:-:1 \@P3 LDS.U.128 load0I04, [addr_zero];\n", + j5c8 => "--:-:-:-:1 \@P3 LDS.U.128 load0I08, [addr_zero];\n", + j5c10 => "--:-:-:-:1 \@P3 LDS.U.128 load0I12, [addr_zero];\n", + + j5c57 => "10:-:-:-:1 \@P2 IADD track0I0.CC, track0I0, 4x<32>;\n", + j5c62 => "--:-:-:-:1 \@P2 IADD.X track0I1, track0I1, RZ;\n", + + j6c8 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 0*128 + 64 + 0>], load1I08;\n", + j6c10 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 1*128 + 64 + 0>], load1I09;\n", + j6c12 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 2*128 + 64 + 0>], load1I10;\n", + j6c14 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 3*128 + 64 + 0>], load1I11;\n", + j6c16 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I12;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I13;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I14;\n", + j6c22 => "--:-:-:-:1 \@P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I15;\n", + + j7c8 => "04:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 + 64 + 0>], load1I00;\n", + j7c10 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 + 64 + 0>], load1I01;\n", + j7c12 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 + 64 + 0>], load1I02;\n", + j7c14 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 + 64 + 0>], load1I03;\n", + j7c16 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04;\n", + j7c18 => "--:-:-:-:1 \@!P0 STS 
[writeIs + 4x< 9*128 + 64 + 16>], load1I05;\n", + j7c20 => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I06;\n", + j7c22 => "--:3:-:-:1 \@!P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I07;\n", + + j7c24 => "--:-:-:-:1 ISETP.NE.AND P2, PT, track1I0, -1, P1;\n", + j7c26 => "--:-:-:-:1 ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n", + + j8c8 => "04:-:-:-:1 \@P2 LDG.E.CI.128 load1I00, [track1I + 4x< 0>];\n", + j8c10 => "--:-:-:-:1 \@P2 LDG.E.CI.128 load1I04, [track1I + 4x< 8>];\n", + j8c12 => "--:-:-:-:1 \@P2 LDG.E.CI.128 load1I08, [track1I + 4x<16>];\n", + j8c14 => "--:5:3:-:1 \@P2 LDG.E.CI.128 load1I12, [track1I + 4x<24>];\n", + + j9c8 => "--:-:-:-:1 \@P3 LDS.U.128 load1I00, [addr_zero];\n", + j9c10 => "--:-:-:-:1 \@P3 LDS.U.128 load1I04, [addr_zero];\n", + j10c8 => "--:-:-:-:1 \@P3 LDS.U.128 load1I08, [addr_zero];\n", + j10c10 => "--:-:-:-:1 \@P3 LDS.U.128 load1I12, [addr_zero];\n", + + j10c57 => "10:-:-:-:1 \@P2 IADD track1I0.CC, track1I0, 4x<32>;\n", + j10c62 => "--:-:-:-:1 \@P2 IADD.X track1I1, track1I1, RZ;\n", + + + j11c8 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 0*64 + 0>], loadE08;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 1*64 + 0>], loadE09;\n", + j11c12 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 2*64 + 0>], loadE10;\n", + j11c14 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 3*64 + 0>], loadE11;\n", + j11c16 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 8*64 + 16>], loadE12;\n", + j11c18 => "--:-:-:-:1 \@P0 STS [writeEs + 4x< 9*64 + 16>], loadE13;\n", + j11c20 => "--:-:-:-:1 \@P0 STS [writeEs + 4x<10*64 + 16>], loadE14;\n", + j11c22 => "--:-:-:-:1 \@P0 STS [writeEs + 4x<11*64 + 16>], loadE15;\n", + + j12c8 => "08:-:-:-:1 \@!P0 STS [writeEs + 4x< 0*64 + 0>], loadE00;\n", + j12c10 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 1*64 + 0>], loadE01;\n", + j12c12 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 2*64 + 0>], loadE02;\n", + j12c14 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 3*64 + 0>], loadE03;\n", + j12c16 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 8*64 + 16>], 
loadE04;\n", + j12c18 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 9*64 + 16>], loadE05;\n", + j12c20 => "--:-:-:-:1 \@!P0 STS [writeEs + 4x<10*64 + 16>], loadE06;\n", + j12c22 => "--:4:-:-:1 \@!P0 STS [writeEs + 4x<11*64 + 16>], loadE07;\n", + + j12c24 => "--:-:-:-:1 ISETP.LT.AND P2, PT, k, param_K, P1;\n", + + j13c8 => "08:-:-:-:1 \@P2 LDG.E.CI.128 loadE00, [trackE + 4x< 0>];\n", + j13c10 => "--:-:-:-:1 \@P2 LDG.E.CI.128 loadE04, [trackE + 4x< 8>];\n", + j13c12 => "--:-:-:-:1 \@P2 LDG.E.CI.128 loadE08, [trackE + 4x<16>];\n", + j13c14 => "--:5:4:-:1 \@P2 LDG.E.CI.128 loadE12, [trackE + 4x<24>];\n", + + j15c57 => "10:-:-:-:1 \@P2 IADD trackE0.CC, trackE0, 4x<32>;\n", + j15c62 => "--:-:-:-:1 \@P2 IADD.X trackE1, trackE1, RZ;\n", + + # p0 = N >= 16 and not (N == 32 and (p or q)) + j14c8 => "--:-:-:-:1 ISETP.EQ.AND P0, PT, loopN, 32, PT;\n", + j14c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, loopN, 16, PT;\n", + j14c22 => "--:-:-:-:1 PSETP.OR.AND P0, PT, P5, P6, P0;\n", + j14c35 => "--:-:-:-:1 PSETP.AND.AND P0, PT, !P0, P1, PT;\n", + + j14c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "20:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j15c63 => "--:-:-:Y:5 \@P0 BRA.U NEXT_16N;\n" . + "--:-:-:-:0 \@P5 IADD q, q, param_grid_Q;\n" . + "01:-:-:Y:5 \@P5 BRA.U NEXT_PQ;\n" . + "--:-:-:-:1 \@P6 MOV q, qq;\n" . + "--:-:-:-:0 \@P6 IADD p, p, param_grid_P;\n" . + "--:-:-:Y:5 \@P6 BRA.U NEXT_PQ;\n" . + "--:-:-:Y:5 BRA.U FINISH;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out; + foreach my $j (0 .. 15) + { + my $odd = $j & 1; + my $nOdd = 1 - $odd; + my $rsOffset = ($j + 1) & 15; + my $rsPred = $j == 15 ? 
'@P0' : ' '; + my $shift = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2; + my $barrier = $j == 14 ? '6' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:%s:1:-:1 %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +FIRST_LOAD: + +--:-:-:-:0 PSETP.AND.AND P0, PT, PT, PT, !PT; + +--:-:-:-:1 @P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>]; +--:-:-:-:1 @P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>]; +--:-:-:-:1 @P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>]; +--:-:1:-:1 @P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>]; +--:-:-:-:1 @!P2 LDS.U.128 load0I00, [addr_zero]; +--:-:-:-:1 @!P2 LDS.U.128 load0I04, [addr_zero]; +--:-:-:-:1 @!P2 LDS.U.128 load0I08, [addr_zero]; +--:-:4:-:1 @!P2 LDS.U.128 load0I12, [addr_zero]; + +// p1 = N == 32 and (p or q) +--:-:-:-:0 ISETP.EQ.AND P1, PT, loopN, 32, PT; + +--:-:-:-:1 @P3 LDG.E.CI.128 load1I00, [track1I + 4x< 0>]; +--:-:-:-:1 @P3 LDG.E.CI.128 load1I04, [track1I + 4x< 8>]; +--:-:-:-:1 @P3 LDG.E.CI.128 load1I08, [track1I + 4x<16>]; +--:-:2:-:1 @P3 LDG.E.CI.128 load1I12, [track1I + 4x<24>]; +--:-:-:-:1 @!P3 LDS.U.128 load1I00, [addr_zero]; +--:-:-:-:1 
@!P3 LDS.U.128 load1I04, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.128 load1I08, [addr_zero]; +--:-:5:-:1 @!P3 LDS.U.128 load1I12, [addr_zero]; + +--:-:-:-:1 @P4 LDG.E.CI.128 loadE00, [trackE + 4x< 0>]; +--:-:-:-:1 @P4 LDG.E.CI.128 loadE04, [trackE + 4x< 8>]; +--:-:-:-:1 @P4 LDG.E.CI.128 loadE08, [trackE + 4x<16>]; +--:-:3:-:1 @P4 LDG.E.CI.128 loadE12, [trackE + 4x<24>]; +--:-:-:-:1 @!P4 LDS.U.128 loadE00, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 loadE04, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 loadE08, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 loadE12, [addr_zero]; + +--:-:-:-:0 PSETP.OR.AND P1, PT, P5, P6, P1; + +09:-:-:-:1 STS [writeIs + 4x< 0*128 + 0 + 0>], load0I00; +--:-:-:-:1 STS [writeIs + 4x< 1*128 + 0 + 0>], load0I01; +--:-:-:-:1 STS [writeIs + 4x< 2*128 + 0 + 0>], load0I02; +--:-:-:-:1 STS [writeIs + 4x< 3*128 + 0 + 0>], load0I03; +--:-:-:-:1 STS [writeIs + 4x< 8*128 + 0 + 16>], load0I04; +--:-:-:-:1 STS [writeIs + 4x< 9*128 + 0 + 16>], load0I05; +--:-:-:-:1 STS [writeIs + 4x<10*128 + 0 + 16>], load0I06; +--:-:-:-:1 STS [writeIs + 4x<11*128 + 0 + 16>], load0I07; + +--:-:-:-:6 @P2 IADD track0I0.CC, track0I0, 4x<32>; +--:-:-:-:0 @P2 IADD.X track0I1, track0I1, RZ; + +12:-:-:-:1 STS [writeIs + 4x< 0*128 + 64 + 0>], load1I00; +--:-:-:-:1 STS [writeIs + 4x< 1*128 + 64 + 0>], load1I01; +--:-:-:-:1 STS [writeIs + 4x< 2*128 + 64 + 0>], load1I02; +--:-:-:-:1 STS [writeIs + 4x< 3*128 + 64 + 0>], load1I03; +--:-:-:-:1 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04; +--:-:-:-:1 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I05; +--:-:-:-:1 STS [writeIs + 4x<10*128 + 64 + 16>], load1I06; +--:-:-:-:1 STS [writeIs + 4x<11*128 + 64 + 16>], load1I07; + +--:-:-:-:3 @P3 IADD track1I0.CC, track1I0, 4x<32>; +--:-:-:-:2 PSETP.AND.AND P5, PT, P5, P1, PT; +--:-:-:-:1 PSETP.AND.AND P6, PT, P6, P1, PT; +--:-:-:-:0 @P3 IADD.X track1I1, track1I1, RZ; + +24:-:-:-:1 STS [writeEs + 4x< 0*64 + 0>], loadE00; +--:-:-:-:1 STS [writeEs + 4x< 1*64 + 0>], loadE01; +--:-:-:-:1 STS [writeEs + 4x< 2*64 + 
0>], loadE02; +--:-:-:-:1 STS [writeEs + 4x< 3*64 + 0>], loadE03; +--:-:-:-:1 STS [writeEs + 4x< 8*64 + 16>], loadE04; +--:-:-:-:1 STS [writeEs + 4x< 9*64 + 16>], loadE05; +--:-:-:-:1 STS [writeEs + 4x<10*64 + 16>], loadE06; +--:1:-:-:1 STS [writeEs + 4x<11*64 + 16>], loadE07; + +--:-:-:-:6 @P4 IADD trackE0.CC, trackE0, 4x<32>; +--:-:-:-:1 @P4 IADD.X trackE1, trackE1, RZ; + +--:-:-:-:1 IADD readEs, readEs, -swapBuf; +--:-:-:-:0 IADD readIs, readIs, -swapBuf; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 IADD nextQ, q, param_grid_Q; +--:-:-:-:1 IADD nextP, p, param_grid_P; + +--:-:-:-:0 @P5 IADD q, q, param_grid_Q; +--:-:-:Y:5 @P5 BRA.U NEXT_PQ; +--:-:-:-:0 @P6 IADD p, p, param_grid_P; +--:-:-:Y:5 @P6 BRA.U NEXT_PQ; + +--:-:-:-:2 ISETP.LT.AND P5, PT, nextQ, param_Q, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, nextP, param_P, PT; + +--:-:-:Y:5 BRA.U INIT_LOOP; + + +FINISH: + +--:-:-:-:0 MOV one, 1; +--:-:1:-:6 S2R tid, SR_TID.X; +--:-:-:Y:d ISETP.EQ.AND P0, PT, one, param_RST, PT; +--:-:-:-:5 @P0 BRA.U CTAID2; +--:-:2:-:1 S2R blkI, SR_CTAID.Y; +--:-:3:-:1 S2R blkE, SR_CTAID.Z; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.X; +--:-:-:-:5 BRA.U END_CTAID2; +CTAID2: +--:-:2:-:1 S2R blkI, SR_CTAID.X; +--:-:3:-:1 S2R blkE, SR_CTAID.Y; +--:-:4:-:1 S2R blk_MPQ, SR_CTAID.Z; +END_CTAID2: + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readEs, readEs, -4x; +--:-:-:-:1 @P0 IADD readIs, readIs, -swapBuf; +--:-:-:-:1 @P0 IADD readEs, readEs, -swapBuf; + +// writeCs = (readIs / 4) * 64 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 4; + + +// readCs = ((tid & 96) << 3) | (tid & 31) +01:-:-:-:1 LOP.AND tid31, tid, 31; +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + + +// kk = blkE*64 + tid31; +04:-:-:-:1 ISCADD kk, blkE, tid31, 6; +--:-:-:-:1 IADD kk, kk, param_offset_K; + + +// crst 
= blkI*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 crst00, tid96, 1; +02:-:-:-:1 ISCADD crst00, blkI, crst00, 7; +--:-:-:-:1 IADD crst04, crst00, 4; +--:-:-:-:1 IADD crst08, crst00, 8; +--:-:-:-:1 IADD crst12, crst00, 12; + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; +--:-:-:-:1 SHL K4, K, 4; +--:-:-:-:1 ISCADD K60, K, -K4, 8; + +// trackF += crst*K + k; +--:-:-:-:1 VMAD.U16.U16 tf, crst00, K, kk; +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSTK, param_CRSTK; +08:-:-:-:1 XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ; + }; + } + return ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 0x2; +--:-:-:-:1 LEA.HI.X track00F1, tf, param_F[1], RZ, 0x2; + +--:-:-:-:1 MOV alpha, param_alpha; + +// kk < K +--:-:-:-:1 ISETP.LT.AND P5, PT, kk, param_K, PT; +--:-:-:-:1 IADD kk, kk, 32; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, PT; + + + +--:-:-:-:6 IADD track04F0.CC, track00F0, K4; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:6 IADD track08F0.CC, track04F0, K4; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:6 IADD track12F0.CC, track08F0, K4; +--:-:-:-:1 IADD.X track12F1, track08F1, RZ; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD track00F0.CC, track00F0, K60;\n" . + "--:-:-:-:1 IADD crst00, crst00, 60;\n" . + "--:-:-:-:1 IADD.X track00F1, track00F1, RZ;\n" . + "--:-:-:-:5 IADD track04F0.CC, track04F0, K60;\n" . + "--:-:-:-:1 IADD crst04, crst04, 60;\n" . + "--:-:-:-:1 IADD.X track04F1, track04F1, RZ;\n" . + "--:-:-:-:5 IADD track08F0.CC, track08F0, K60;\n" . + "--:-:-:-:1 IADD crst08, crst08, 60;\n" . + "--:-:-:-:1 IADD.X track08F1, track08F1, RZ;\n" . + "--:-:-:-:5 IADD track12F0.CC, track12F0, K60;\n" . + "--:-:-:-:1 IADD crst12, crst12, 60;\n" . + "--:-:-:-:1 IADD.X track12F1, track12F1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL f0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL f1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL f2, cx2y%d, alpha;\n" . 
+ "--:-:-:-:1 FMUL f3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL f4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL f5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL f6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL f7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K +--:-:-:-:0 IADD crst12, crst12, 1; + +// Warp shuffle to drop the awkward readAs/readBs mapping +--:-:-:-:1 STS.128 [writeCs+4x<00>], f0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], f4; + +--:-:1:-:1 LDS f0, [readCs + 4x<0*64 + 00>]; +--:-:2:-:1 LDS f2, [readCs + 4x<1*64 + 00>]; +--:-:3:-:1 LDS f4, [readCs + 4x<2*64 + 00>]; +--:-:4:-:1 LDS f6, [readCs + 4x<3*64 + 00>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:-:-:-:1 @P0 STG.E.CG [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 STG.E.CG [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 STG.E.CG [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 STG.E.CG [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } + else + { + return q{ +01:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P6, PT; +02:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2; +--:-:-:-:1 PSETP.AND.AND P1, PT, P1, P6, PT; +04:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4; +--:-:-:-:1 PSETP.AND.AND P2, PT, P2, P6, PT; +08:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6; +--:-:-:-:1 PSETP.AND.AND P3, PT, P3, P6, PT; + }; + } ++] + +--:-:1:-:1 LDS f1, 
[readCs + 4x<0*64 + 32>]; +--:-:2:-:1 LDS f3, [readCs + 4x<1*64 + 32>]; +--:-:3:-:1 LDS f5, [readCs + 4x<2*64 + 32>]; +--:-:4:-:1 LDS f7, [readCs + 4x<3*64 + 32>]; + +[+ + our $determ; + if ($determ) + { + return q{ +01:1:-:-:1 @P0 STG.E.CG [track00F + 4x<32>], f1; +02:2:-:-:1 @P1 STG.E.CG [track04F + 4x<32>], f3; +04:3:-:-:1 @P2 STG.E.CG [track08F + 4x<32>], f5; +08:4:-:-:1 @P3 STG.E.CG [track12F + 4x<32>], f7; + }; + } + else + { + return q{ +01:1:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1; +02:2:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3; +04:3:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5; +08:4:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7; + }; + } ++] + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X128_N128.sass b/Kernel/Convolution/Pascal/sconv_xprop_X128_N128.sass new file mode 100644 index 0000000..8f91aba --- /dev/null +++ b/Kernel/Convolution/Pascal/sconv_xprop_X128_N128.sass @@ -0,0 +1,233 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[- + our $prefix = 's'; + our $shareI = 128; + our $shareF = 128; + our $stepI = 32; + our $stepF = 64; +-] + + + + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + szShareF : (128*8) + szShareI : (128*8) + + addr_zero : 4x<128*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<128*8*2 + 128*8*2 + 4> + addr_m : 4x<128*8*2 + 128*8*2 + 4> + addr_p : 4x<128*8*2 + 128*8*2 + 5> + addr_q : 4x<128*8*2 + 128*8*2 + 6> + addr_k : 4x<128*8*2 + 128*8*2 + 7> + addr_szLut : 4x<128*8*2 + 128*8*2 + 8> + addr_lut : 4x<128*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne + 72-111 ~ tid1, tid128, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-107 : loadI<0-3>, loadF<0-3> + + 108-111 ~ offsetF, offsetI, offsetFc, offsetIc + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 114-122 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset + 123-127 ~ readFs, readIs, tid, idx_N + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-122 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 
S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidX = (tid & 31) << 2 +// tidY = tid >> 5 +--:-:-:-:1 LOP.AND tidX, tid, 31; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 5; + +// trackF += blkF*128 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 7; + +// trackI += blkI*128 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 7; + +// writeS = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 SHL writeS, writeS, 2; + +// readFs = ((tid & 112) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 112; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7) +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 SHR.U32 tid128, tid128, 3; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid128; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF, [trackF]; +--:-:5:-:1 @!P1 LDS.U.128 loadF, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero]; + +12:-:-:-:1 STS.128 [writeS], loadF; +24:1:-:-:1 STS.128 [writeS + 4x], loadI; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF, [trackF]; +--:-:3:-:1 @P1 LDG.E.128 loadI, [trackI]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 
VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c40 => "02:2:-:-:1 \@P0 STS.128 [writeS], loadF;\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "20:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "02:-:2:-:1 \@P1 LDG.E.CI.128 loadF, [trackF];\n", + + + j6c8 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x], loadI;\n", + + j6c54 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j6c59 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j6c61 => "04:-:3:-:1 \@P1 LDG.E.128 loadI, [trackI];\n", + + j6c62 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x;\n" . 
+ "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; + + + +// tidOX = (tid & 7) << 2 + (tid & 128) >> 1 +// tidOY = (tid & 127) >> 3 +--:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 LOP.AND tidOX2, tid, 128; +--:-:-:-:1 SHR.U32 tidOX2, tidOX2, 1; +--:-:-:-:1 LOP.OR tidOX, tidOX, tidOX2; +--:-:-:-:1 LOP.AND tidOY, tid, 127; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 LOP.AND readIs, readIs, 0x1ff; +--:-:-:-:1 LOP.AND readFs, readFs, 0x0ff; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +--:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*128 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 7; + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X128_N64.sass b/Kernel/Convolution/Pascal/sconv_xprop_X128_N64.sass new file mode 100644 index 0000000..d7bd0a1 --- /dev/null +++ b/Kernel/Convolution/Pascal/sconv_xprop_X128_N64.sass @@ -0,0 +1,246 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $prefix = 's'; + our $shareI = 64; + our $shareF = 128; + our $stepI = 32; + our $stepF = 64; +-] + + + + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + szShareF : (128*8) + szShareI : (64*8) + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 128*8*2 + 4> + addr_m : 4x<64*8*2 + 128*8*2 + 4> + addr_p : 4x<64*8*2 + 128*8*2 + 5> + addr_q : 4x<64*8*2 + 128*8*2 + 6> + addr_k : 4x<64*8*2 + 128*8*2 + 7> + addr_szLut : 4x<64*8*2 + 128*8*2 + 8> + addr_lut : 4x<64*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-111 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-111 : loadI<0-3>, loadF<0-7> + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 104-107 ~ offsetF, offsetIc, offsetFc + + 114-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI + 125-127 ~ readFs, readIs, swapBuf + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-124 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, 
out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidX = (tid & 15) << 2 +// tidY = tid >> 4 +--:-:-:-:1 LOP.AND tidX, tid, 15; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 4; + +// trackF += blkF*128 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 7; + +// trackI += blkI*64 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 6; + +// writeFs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidX, 7; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 6; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = ((tid & -16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = (tid >> 1) & 7 +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>]; +--:-:3:-:1 @P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>]; +--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; +--:-:5:-:1 @!P1 LDS.U.128 loadF4, [addr_zero]; + +--:-:4:-:1 @P1 LDG.E.128 loadI, [trackI]; +--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero]; + +12:-:-:-:1 STS.128 [writeFs + 4x<00>], loadF0; +04:-:-:-:1 STS.128 [writeFs + 4x<64>], loadF4; + +28:1:-:-:1 STS.128 [writeIs], loadI; + +[+ loop_setup() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>]; +--:5:3:-:1 @P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>]; +--:-:4:-:1 @P1 LDG.E.128 loadI, [trackI]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 
ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c40 => "02:2:-:-:1 \@P0 STS.128 [writeFs + 4x<00>], loadF0;\n", + + j2c10 => "02:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "20:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "10:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>];\n", + + j4c8 => "04:3:-:-:1 \@P0 STS.128 [writeFs + 4x<64>], loadF4;\n", + + j4c60 => "04:5:3:-:1 \@P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>];\n", + + j6c8 => "08:4:-:-:1 \@P0 STS.128 [writeIs], loadI;\n", + + j6c55 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j6c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j6c62 => "08:5:4:-:1 \@P1 LDG.E.128 loadI, [trackI];\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 2 +// tidOY = tid >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 SHR.U32 tidOY, tid, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 64 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 6; + +// Mul by 4 here expands k stride back out +// k = blkF*128 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 7; + +[+ output_setup(63, 0, 6) +] + + + +[+ output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X32_N128.sass b/Kernel/Convolution/Pascal/sconv_xprop_X32_N128.sass new file mode 100644 index 0000000..568e714 --- /dev/null +++ b/Kernel/Convolution/Pascal/sconv_xprop_X32_N128.sass @@ -0,0 +1,262 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[- + our $prefix = 's'; + our $shareI = 128; + our $shareF = 32; + our $stepI = 32; + our $stepF = 16; +-] + + + + + addr_zero : 4x<32*8*2 + 128*8*2 + 0> + szShareF : (32*8) + szShareI : (128*8) + + addr_zero : 4x<32*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<32*8*2 + 128*8*2 + 4> + addr_m : 4x<32*8*2 + 128*8*2 + 4> + addr_p : 4x<32*8*2 + 128*8*2 + 5> + addr_q : 4x<32*8*2 + 128*8*2 + 6> + addr_k : 4x<32*8*2 + 128*8*2 + 7> + addr_szLut : 4x<32*8*2 + 128*8*2 + 8> + addr_lut : 4x<32*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-69 : m, p, q + 64-69 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne + 70-113 ~ tid1, tid32, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 70-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-119 : loadI<00-15>, loadF<0-3> + + 120-121 : sliceI, sliceF + 120-121 : sliceIF<0-1> + + 122-140 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetF, offsetIc, offsetFc + 141-155 ~ readFs, readIs, swapBuf, tid, idx_N + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-140 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R 
idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] + +[+ get_mpqk() +] + +// tidX = (tid & 7) << 2 +// tidY = tid >> 3 +--:-:-:-:1 LOP.AND tidX, tid, 7; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 3; + +// trackF += blkF*32 + tidX + offset_K +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 5; + +// trackI += blkI*128 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 7; + +// writeFs = (32*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidX, 5; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +// readFs = (((tid & 16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:0 SHL readFs, readFs, 4; + +// readIs = ((tid & 32) >> 1) | ((tid >> 1) & 7) << 4 +--:-:-:-:1 LOP.AND tid32, tid, 32; +--:-:-:-:1 SHR.U32 tid32, tid32, 1; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid32; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, 4x; + + + +[+ load_lut() +] + +--:-:1:-:1 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; + +--:-:2:-:1 @P1 LDG.E.128 loadI00, [trackI + 4x< 0>]; +--:-:3:-:1 @P1 LDG.E.128 loadI04, [trackI + 4x<32>]; +--:-:4:-:1 @P1 LDG.E.128 loadI08, [trackI + 4x<64>]; +--:-:5:-:1 @P1 LDG.E.128 loadI12, [trackI + 4x<96>]; +--:-:-:-:1 @!P1 LDS.U.128 loadI00, [addr_zero]; +--:-:-:-:1 @!P1 LDS.U.128 loadI04, [addr_zero]; +--:-:-:-:1 @!P1 LDS.U.128 loadI08, [addr_zero]; +--:-:6:-:2 @!P1 LDS.U.128 loadI12, [addr_zero]; + +21:-:-:-:1 STS.128 [writeFs], loadF0; + +02:-:-:-:1 STS.128 [writeIs + 4x< 0>], loadI00; +04:-:-:-:1 STS.128 [writeIs + 4x<32>], loadI04; +08:-:-:-:1 STS.128 [writeIs + 
4x<64>], loadI08; +10:1:-:-:1 STS.128 [writeIs + 4x<96>], loadI12; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:-:-:1 @P1 LDG.E.128 loadI00, [trackI + 4x< 0>]; +--:-:3:-:1 @P1 LDG.E.128 loadI04, [trackI + 4x<32>]; +--:-:-:-:1 @P1 LDG.E.128 loadI08, [trackI + 4x<64>]; +--:5:4:-:1 @P1 LDG.E.128 loadI12, [trackI + 4x<96>]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c40 => "02:-:-:-:1 \@P0 STS.128 [writeFs], loadF0;\n", + + j1c62 => "--:-:2:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "02:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF];\n", + + j3c8 => "04:-:-:-:1 \@P0 STS.128 [writeIs + 4x< 0>], loadI00;\n", + j3c10 => "--:3:-:-:1 \@P0 STS.128 [writeIs + 4x<32>], loadI04;\n", + + j3c55 => "10:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j3c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, 
offsetI, param_I[1], RZ, 2;\n", + + j4c8 => "04:-:-:-:1 \@P1 LDG.E.128 loadI00, [trackI + 4x< 0>];\n", + j4c10 => "--:-:3:-:1 \@P1 LDG.E.128 loadI04, [trackI + 4x<32>];\n", + + j6c8 => "08:-:-:-:1 \@P0 STS.128 [writeIs + 4x<64>], loadI08;\n", + j6c10 => "--:4:-:-:1 \@P0 STS.128 [writeIs + 4x<96>], loadI12;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "08:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c8 => "--:-:-:-:1 \@P1 LDG.E.128 loadI08, [trackI + 4x<64>];\n", + j7c10 => "--:5:4:-:1 \@P1 LDG.E.128 loadI12, [trackI + 4x<96>];\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; + + + +// tidOX = (tid & 7) << 2 + (tid & 32) << 1 +// tidOY = (tid & 31) >> 3 +--:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 LOP.AND tidOX2, tid, 32; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 ISCADD tidOX, tidOX2, tidOX, 1; +--:-:-:-:1 LOP.AND tidOY, tid, 31; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +--:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*32 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +--:-:-:-:1 ISCADD k, idx_K, tidOY, 5; + +[+ output_setup(63, 1, 6) +] + + + +[+ output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X64_N128.sass 
b/Kernel/Convolution/Pascal/sconv_xprop_X64_N128.sass new file mode 100644 index 0000000..b782b8a --- /dev/null +++ b/Kernel/Convolution/Pascal/sconv_xprop_X64_N128.sass @@ -0,0 +1,253 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $prefix = 's'; + our $shareI = 128; + our $shareF = 64; + our $stepI = 64; + our $stepF = 32; +-] + + + + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + szShareF : (64*8) + szShareI : (128*8) + + addr_zero : 4x<64*8*2 + 128*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 128*8*2 + 4> + addr_m : 4x<64*8*2 + 128*8*2 + 4> + addr_p : 4x<64*8*2 + 128*8*2 + 5> + addr_q : 4x<64*8*2 + 128*8*2 + 6> + addr_k : 4x<64*8*2 + 128*8*2 + 7> + addr_szLut : 4x<64*8*2 + 128*8*2 + 8> + addr_lut : 4x<64*8*2 + 128*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-67 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-111 ~ tid1, tid64, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, 
dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-111 : loadI<0-7>, loadF<0-3> + + 112-113 : sliceI, sliceF + 112-113 : sliceIF<0-1> + + 108-111 ~ offsetF, offsetIc, offsetFc + + 114-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI + 125-127 ~ readFs, readIs, swapBuf + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-124 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] +[+ get_mpqk() +] + +// tidX = (tid & 15) << 2 +// tidY = tid >> 4 +--:-:-:-:1 LOP.AND tidX, tid, 15; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 4; + +// trackF += blkF*64 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 6; + +// trackI += blkI*128 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 7; + +// writeFs = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidX, 6; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// readFs = ((tid & 48) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 48; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:1 SHL readFs, readFs, 4; + +// readIs = ((tid & 64) >> 3) | ((tid >> 1) & 7) +--:-:-:-:1 LOP.AND tid64, tid, 64; +--:-:-:-:1 SHR.U32 tid64, tid64, 3; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 
+--:-:-:-:1 LOP.OR readIs, readIs, tid64; +--:-:-:-:0 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:5:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x<00>]; +--:-:4:-:1 @P1 LDG.E.128 loadI4, [trackI + 4x<64>]; +--:-:-:-:1 @!P1 LDS.U.128 loadI0, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 loadI4, [addr_zero]; + +12:-:-:-:1 STS.128 [writeFs], loadF0; + +24:-:-:-:1 STS.128 [writeIs + 4x<00>], loadI0; +08:1:-:-:1 STS.128 [writeIs + 4x<64>], loadI4; + +[+ loop_setup() +] + +--:-:2:-:2 @P1 LDG.E.CI.128 loadF0, [trackF]; +--:-:3:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x<00>]; +--:5:4:-:1 @P1 LDG.E.128 loadI4, [trackI + 4x<64>]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c17 => "--:-:6:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j1c40 => "02:2:-:-:1 \@P0 STS.128 [writeFs], loadF0;\n", + + j2c10 => "02:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "20:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X 
trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF];\n", + + j3c8 => "04:3:-:-:1 \@P0 STS.128 [writeIs + 4x<00>], loadI0;\n", + + j3c55 => "10:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j3c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j4c8 => "04:-:3:-:1 \@P1 LDG.E.128 loadI0, [trackI + 4x<00>];\n", + + j6c8 => "08:4:-:-:1 \@P0 STS.128 [writeIs + 4x<64>], loadI4;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "08:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c8 => "--:5:4:-:1 \@P1 LDG.E.128 loadI4, [trackI + 4x<64>];\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 2 + (tid & 64) >> 1 +// tidOY = (tid & 63) >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 LOP.AND tidOX2, tid, 64; +--:-:-:-:1 SHR.U32 tidOX2, tidOX2, 1; +--:-:-:-:1 LOP.OR tidOX, tidOX, tidOX2; +--:-:-:-:1 LOP.AND tidOY, tid, 63; +--:-:-:-:1 SHR.U32 tidOY, tidOY, 3; + +--:-:-:-:1 ISETP.GT.AND P2, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readIs, readIs, -4x; +--:-:-:-:1 @P2 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 @P2 IADD readIs, readIs, -swapBuf; + +// Div by 4 here collapses k stride +// writeCs = (readFs / 4) * 128 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 5; + +// readCs = 4 * (tidOX + (tidOY * 128)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 7; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*128 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 7; + +// Mul by 4 here expands k stride back out +// k = blkF*64 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; 
+01:-:-:-:1 ISCADD k, idx_K, tidOY, 6; + +[+ output_setup(31, 1, 5) +] + + + +[+ output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X64_N64.sass b/Kernel/Convolution/Pascal/sconv_xprop_X64_N64.sass new file mode 100644 index 0000000..b42fbea --- /dev/null +++ b/Kernel/Convolution/Pascal/sconv_xprop_X64_N64.sass @@ -0,0 +1,240 @@ + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our $prefix = 's'; + our $shareI = 64; + our $shareF = 64; + our $stepI = 32; + our $stepF = 32; +-] + + + + + addr_zero : 4x<64*8*2 + 64*8*2 + 0> + szShareF : (64*8) + szShareI : (64*8) + + addr_zero : 4x<64*8*2 + 64*8*2 + 0> + addr_mpqk : 4x<64*8*2 + 64*8*2 + 4> + addr_m : 4x<64*8*2 + 64*8*2 + 4> + addr_p : 4x<64*8*2 + 64*8*2 + 5> + addr_q : 4x<64*8*2 + 64*8*2 + 6> + addr_k : 4x<64*8*2 + 64*8*2 + 7> + addr_szLut : 4x<64*8*2 + 64*8*2 + 8> + addr_lut : 4x<64*8*2 + 64*8*2 + 10> + +[+ params() +] + + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + + 64-67 : mpqk<0-3> + 64-66 : m, p, q + 64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne + 72-113 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, 
magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2 + 72-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 96-99 : trackI<0-1>, trackF<0-1> + 100-115 : loadI<0-7>, loadF<0-7> + + 108-113 ~ offsetF, offsetIc, offsetFc + 114-115 : sliceI, sliceF + 114-115 : sliceIF<0-1> + + 116-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI + 126-127 ~ readFs, readIs + + 72-91 : cs<0-7>, c<0-3>, b<0-7> + 72-83 ~ x<0-7> + 92-99 : out<0-7> + 100-101 : Out<0-1> + 102-103 : Sum<0-1> + 104-125 ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, preds, one + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; + +[+ load_zeros() +] + +[+ get_mpqk() +] + +// tidX = (tid & 7) << 2 +// tidY = tid >> 3 +--:-:-:-:1 LOP.AND tidX, tid, 7; +--:-:-:-:1 SHL tidX, tidX, 2; +--:-:-:-:1 SHR.U32 tidY, tid, 3; + +// trackF += blkF*64 + tidX +--:-:-:-:1 ISCADD offsetFk, idx_K, tidX, 6; + +// trackI += blkI*64 + tidX +08:-:-:-:1 ISCADD offsetIn, idx_N, tidX, 6; + +// writeS = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 6; +--:-:-:-:1 SHL writeS, writeS, 2; + +// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, -16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; +--:-:-:-:0 SHL readFs, readFs, 4; + +// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>; +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readIs, readIs, 4x, 4; + + +[+ load_lut() +] + +--:-:1:-:1 @P1 LDG.E.CI.128 
loadF0, [trackF + 4x< 0>]; +--:-:2:-:1 @P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>]; +--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero]; +--:-:5:-:2 @!P1 LDS.U.128 loadF4, [addr_zero]; + +--:-:3:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x< 0>]; +--:-:4:-:1 @P1 LDG.E.128 loadI4, [trackI + 4x<32>]; +--:-:-:-:1 @!P1 LDS.U.128 loadI0, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 loadI4, [addr_zero]; + +11:-:-:-:1 STS.128 [writeS + 4x<0*64 + 0>], loadF0; +02:-:-:-:1 STS.128 [writeS + 4x<0*64 + 32>], loadF4; + +24:-:-:-:1 STS.128 [writeS + 4x<8*64 + 0>], loadI0; +08:1:-:-:1 STS.128 [writeS + 4x<8*64 + 32>], loadI4; + +[+ loop_setup() +] + +--:-:2:-:1 @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>]; +--:-:3:-:1 @P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>]; +--:-:4:-:1 @P1 LDG.E.128 loadI0, [trackI + 4x< 0>]; +--:-:5:-:1 @P1 LDG.E.128 loadI4, [trackI + 4x<32>]; + +[- + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P1, PT, posCRST, RZ, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n", + + j0c13 => "--:-:6:-:1 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n", + + j0c39 => "20:-:-:-:1 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n", + j0c44 => "--:-:-:-:1 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c46 => "--:-:6:-:1 \@P1 F2I.S32.F32.TRUNC channel, channel;\n", + + j1c8 => "20:-:-:-:1 \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n", + j1c13 => "--:-:-:-:1 \@P1 SHL lutOffset, lutOffset, 3;\n", + + j1c37 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*64 + 0>], loadF0;\n", + j1c39 => "04:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*64 + 32>], loadF4;\n", + + j1c62 => "02:-:2:-:1 \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n", + + j2c10 => "--:-:-:-:1 \@P1 XMAD offsetFc, channel, param_KRST, RZ;\n", + j2c15 => "--:-:-:-:1 \@P1 XMAD offsetIc, channel, param_DHWN, RZ;\n", + j2c20 => "--:-:-:-:1 \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n", + j2c22 => "--:-:-:-:1 IADD posCRST, posCRST, -8;\n", + + j2c29 => "02:-:-:-:1 \@P1 IADD3 offsetF, offsetFk, 
offsetFc, sliceF;\n", + j2c34 => "--:-:-:-:1 \@P1 LEA trackF0.CC, offsetF, param_F[0], 2;\n", + j2c36 => "--:-:-:-:1 \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n", + j2c38 => "--:-:-:-:1 \@P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, 2;\n", + + j2c40 => "--:-:2:-:1 \@P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];\n", + j2c42 => "--:-:3:-:1 \@P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>];\n", + + j6c8 => "08:-:-:-:1 \@P0 STS.128 [writeS + 4x<8*64 + 0>], loadI0;\n", + j6c10 => "10:4:-:-:1 \@P0 STS.128 [writeS + 4x<8*64 + 32>], loadI4;\n", + + j6c55 => "--:-:-:-:1 \@P1 LEA trackI0.CC, offsetI, param_I[0], 2;\n", + j6c60 => "--:-:-:-:1 \@P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, 2;\n", + + j6c62 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "08:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n", + + j7c8 => "--:-:4:-:1 \@P1 LDG.E.128 loadI0, [trackI + 4x< 0>];\n", + j7c10 => "--:-:5:-:1 \@P1 LDG.E.128 loadI4, [trackI + 4x<32>];\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); +-] + +LOOP: + +[+ main_loop() +] + +--:-:1:-:1 LDS.U.128 mpqk, [addr_mpqk]; +--:-:2:-:1 S2R tid, SR_TID.X; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +// tidOX = (tid & 7) << 2 +// tidOY = tid >> 3 +02:-:-:-:1 LOP.AND tidOX, tid, 7; +--:-:-:-:1 SHL tidOX, tidOX, 2; +--:-:-:-:1 SHR.U32 tidOY, tid, 3; + +--:-:-:-:1 LOP.AND readIs, readIs, 0x7ff; +--:-:-:-:1 LOP.AND readFs, readFs, 0x7ff; + +// Div by 4 here collapses k stride +// writeCs = (readKs / 4) * 64 + readNs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 4; + +// readCs = 4 * (tidOX + (tidOY * 64)) +--:-:-:-:1 ISCADD readCs, tidOY, tidOX, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = blkI*64 + tidOX; +04:-:-:-:1 ISCADD n, idx_N, tidOX, 6; + +// Mul by 4 here expands k stride back out +// k = blkF*64 + tidOY * 4 +--:-:-:-:1 SHL tidOY, tidOY, 2; +01:-:-:-:1 ISCADD k, idx_K, tidOY, 6; + +[+ output_setup(63, 0, 6) +] + + + +[+ 
output() +] \ No newline at end of file diff --git a/Kernel/Convolution/Pascal/xconv_direct_updat_64x32.sass b/Kernel/Convolution/Pascal/xconv_direct_updat_64x32.sass new file mode 100644 index 0000000..803487e --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_direct_updat_64x32.sass @@ -0,0 +1,1077 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our ($type, $SN, $D); +our $determ = $D; +our $largeN = !$SN; +our $dtype = $type eq 'h' ? '.U16' : ''; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +our $vec_size = $type eq 'h' ? '64' : '128'; +our $dtype_shift = $type eq 'h' ? '1' : '2'; +our $dtype_size = $type eq 'h' ? '2' : '4'; +sub dtype { return $dtype; } +sub dtype_shift { return $dtype_shift; } +sub vec_size { return $vec_size; } +sub output_op { return $determ ? 
'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; } +-] + + + + addr_zero : 4x<(32 + 64)*33*2> + szShareI : (64*33) + szShareE : (32*33) + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_C : c[0x0][0x15c] + param_D : c[0x0][0x160] + param_H : c[0x0][0x164] + param_W : c[0x0][0x168] + param_N : c[0x0][0x16c] + param_K : c[0x0][0x170] + param_M : c[0x0][0x174] + param_P : c[0x0][0x178] + param_Q : c[0x0][0x17c] + param_str_d : c[0x0][0x180] + param_str_h : c[0x0][0x184] + param_str_w : c[0x0][0x188] + param_pad_d : c[0x0][0x18c] + param_pad_h : c[0x0][0x190] + param_pad_w : c[0x0][0x194] + param_dil_d : c[0x0][0x198] + param_dil_h : c[0x0][0x19c] + param_dil_w : c[0x0][0x1a0] + param_DHWN : c[0x0][0x1a4] + param_HWN : c[0x0][0x1a8] + param_WN : c[0x0][0x1ac] + param_MPQN16p : c[0x0][0x1b0] + param_MPQN : c[0x0][0x1b4] + param_PQN : c[0x0][0x1b8] + param_QN : c[0x0][0x1bc] + param_PQkc : c[0x0][0x1c0] + param_Qkc : c[0x0][0x1c4] + param_kc : c[0x0][0x1c8] + param_c : c[0x0][0x1cc] + param_k : c[0x0][0x1d0] + param_magic_PQkc : c[0x0][0x1d4] + param_shift_PQkc : c[0x0][0x1d8] + param_magic_Qkc : c[0x0][0x1dc] + param_shift_Qkc : c[0x0][0x1e0] + param_magic_kc : c[0x0][0x1e4] + param_shift_kc : c[0x0][0x1e8] + param_magic_c : c[0x0][0x1ec] + param_shift_c : c[0x0][0x1f0] + param_CTRSK : c[0x0][0x1f4] + param_CTRS : c[0x0][0x1f8] + param_TRS : c[0x0][0x1fc] + param_RS : c[0x0][0x200] + param_S : c[0x0][0x204] + param_magic_TRS : c[0x0][0x208] + param_shift_TRS : c[0x0][0x20c] + param_magic_RS : c[0x0][0x210] + param_shift_RS : c[0x0][0x214] + param_magic_S : c[0x0][0x218] + param_shift_S : c[0x0][0x21c] + param_superM : c[0x0][0x220] + param_superP : c[0x0][0x224] + param_superQ : c[0x0][0x228] + param_superN : c[0x0][0x22c] + param_shiftM : c[0x0][0x230] + param_shiftP : c[0x0][0x234] + param_shiftQ : c[0x0][0x238] + 
param_strideP : c[0x0][0x23c] + param_strideQ : c[0x0][0x240] + param_stridePQ : c[0x0][0x244] + param_gridP : c[0x0][0x248] + param_gridQ : c[0x0][0x24c] + param_loopX : c[0x0][0x250] + param_loopXp : c[0x0][0x254] + param_loopQ : c[0x0][0x258] + param_loopQp : c[0x0][0x25c] + param_loopN : c[0x0][0x260] + param_loopNp : c[0x0][0x264] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 96-119 : I0<0-3>, I1<0-3>, I2<0-3>, I3<0-3>, E0<0-3>, E1<0-3> + 120-131 : track0I<0-1>, track1I<0-1>, track2I<0-1>, track3I<0-1>, track0E<0-1>, track1E<0-1> + + 64-131 ~ tid, idx_MPQkc, idx_PQkc, idx_Qkc, idx_kc, idx_k, idx_c, magic_PQkc, magic_Qkc, neg_PQkc, neg_Qkc, neg_kc, neg_c, div1, div2, div3, tidX, tidX4, tidY, tid1, readEs2, tid32, tid32_2, neg_TRS, neg_RS, neg_S, super_m, m, mt, k, k16, ctrs<0-3>, trs<0-3>, rs<0-3>, c<0-3>, t<0-3>, z<0-3> + + 80-81 : super_p, super_q + 80-81 : pr, qs + 82-95 ~ p, te, pIn, qIn, predEt, ti<0-3>, y<0-3> + 80-95 ~ loopN, N + + 132-167 ~ tid7, q, n, idx_K, idx_C, idx_M, idx_P, start_P, idx_Q, start_Q, writeIs, writeEs, readIs, readEs, swapBuf, writeFs, predI, predE, init, x<0-3>, czOffset<0-3>, r<0-3>, s<0-3>, kmOffset + + 96-103 : track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1> + 104-119 ~ f00_<0-3>, f04_<0-3>, f08_<0-3>, f12_<0-3> + 104-119 ~ Tid, tid_31, tid_32, K, K16, tf, idx_MPQ, xmad_determ + 120-131 ~ alpha, readFs, K1, kk, crst<00|04|08|12> + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQkc, SR_CTAID.X; 
+--:-:3:-:1 S2R idx_C, SR_CTAID.Y; +--:-:4:-:1 S2R idx_K, SR_CTAID.Z; + + + +--:-:-:-:1 STS.128 [addr_zero], RZ; + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// idx_M = idx_MPQkc / blk_PQkc +--:-:-:-:1 MOV magic_PQkc, param_magic_PQkc; +--:-:-:-:1 ISETP.NE.AND P0, PT, magic_PQkc, 1, PT; +02:-:-:-:1 @P0 XMAD div1, idx_MPQkc, magic_PQkc, RZ; +--:-:-:-:1 @P0 XMAD div2, idx_MPQkc, magic_PQkc.H1, RZ; +--:-:-:-:1 @P0 XMAD div3, idx_MPQkc.H1, magic_PQkc.H1, RZ; +--:-:-:-:1 @P0 XMAD.CHI div1, idx_MPQkc.H1, magic_PQkc, div1; +--:-:-:-:1 @P0 IADD3.RS idx_M, div1, div2, div3; +--:-:-:-:1 @P0 SHR.U32 idx_M, idx_M, param_shift_PQkc; +--:-:-:-:1 @!P0 SHR.U32 idx_M, idx_MPQkc, param_shift_PQkc; + +// idx_PQkc = idx_MPQkc % blk_PQkc +--:-:-:-:1 IADD neg_PQkc, RZ, -param_PQkc; +--:-:-:-:1 XMAD.LO2 idx_PQkc, neg_PQkc, idx_M, idx_MPQkc; + +// idx_P = idx_PQkc / blk_Qkc +--:-:-:-:1 MOV magic_Qkc, param_magic_Qkc; +--:-:-:-:1 ISETP.NE.AND P1, PT, magic_Qkc, 1, PT; +--:-:-:-:1 @P1 XMAD div1, idx_PQkc, magic_Qkc, RZ; +--:-:-:-:1 @P1 XMAD div2, idx_PQkc, magic_Qkc.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, idx_PQkc.H1, magic_Qkc.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, idx_PQkc.H1, magic_Qkc, div1; +--:-:-:-:1 @P1 IADD3.RS idx_P, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 idx_P, idx_P, param_shift_Qkc; +--:-:-:-:1 @!P1 SHR.U32 idx_P, idx_PQkc, param_shift_Qkc; + +// idx_Qkc = idx_PQkc % blk_Qkc +--:-:-:-:1 IADD neg_Qkc, RZ, -param_Qkc; +--:-:-:-:1 XMAD.LO2 idx_Qkc, neg_Qkc, idx_P, idx_PQkc; + +// idx_Q = idx_Qkc / kc +--:-:-:-:1 XMAD.LO2C idx_Q, idx_Qkc, param_magic_kc, RZ; +--:-:-:-:1 SHR.U32 idx_Q, idx_Q, param_shift_kc; +// idx_kc = idx_Qkc % kc +--:-:-:-:1 IADD neg_kc, RZ, -param_kc; +--:-:-:-:1 XMAD.S16.U16 idx_kc, neg_kc, idx_Q, idx_Qkc; + +// idx_k = idx_kc / c +--:-:-:-:1 XMAD idx_k, idx_kc, param_magic_c, RZ; +--:-:-:-:1 SHR.U32 idx_k, idx_k, param_shift_c; +// idx_c = idx_kc % c +--:-:-:-:1 IADD neg_c, RZ, -param_c; +--:-:-:-:1 
XMAD.S16.U16 idx_c, neg_c, idx_k, idx_kc; + +// idx_C = idx_C * blk_c + idx_c +// idx_K = idx_K * blk_k + idx_k +04:-:-:-:1 XMAD idx_C, idx_C, param_c, idx_c; +08:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +--:-:-:-:1 MOV start_P, idx_P; +--:-:-:-:1 MOV start_Q, idx_Q; + +// tidX = tid >> 3 +// tidY = (tid & 7) << 2 +// shiftX = tidY +01:-:-:-:1 SHR.U32 tidX, tid, 3; +--:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHL tidY, tid7, 2; + +// writeIs = (tidY*64 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 6; +--:-:-:-:1 IADD writeIs, writeIs, tidY; +--:-:-:-:1 SHL writeIs, writeIs, 2; + +// writeEs = (tidY*32 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeEs, tidY, tidX, 5; +--:-:-:-:1 IADD writeEs, writeEs, tidY; +--:-:-:-:1 ISCADD writeEs, writeEs, 4x, 2; + +// readEs = (((tid >> 1) & 3) << 4 +--:-:-:-:1 BFE.U32 readEs, tid, 0x201; // 2 bits at position 1 + +// readIs = (((tid & 24) >> 2) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readIs, tid, 24; +--:-:-:-:1 SHR.U32 readIs, readIs, 2; +--:-:-:-:1 LOP.OR readIs, readIs, tid1; + +// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5) +// tid32 = tid & -32 +--:-:-:-:1 LOP.AND tid32, tid, -32; + +// readEs2 = readEs + (tid32 >> 2) + (readIs << 2) +--:-:-:-:1 SHR.U32 tid32_2, tid32, 2; +--:-:-:-:1 IADD readEs2, tid32_2, readEs; +--:-:-:-:1 ISCADD readEs2, readIs, readEs2, 2; + +--:-:-:-:1 SHL readIs, readIs, 4; +--:-:-:-:1 SHL readEs, readEs, 4; +--:-:-:-:1 SHL readEs2, readEs2, 4; + +// writeFs = readIs*32*4 + readEs2 +--:-:-:-:1 ISCADD writeFs, readIs, readEs2, 7; + +// Each block of 32 threads works on 8 lines, +// Also shift over each 8 lines by 8 (cumulative) +// readIs += tid32/4 * 64 * 4 + tid32/4 * 4 +// readEs += tid32/4 * 32 * 4 + tid32/4 * 4 + 4x +--:-:-:-:1 ISCADD readIs, tid32, readIs, 6; +--:-:-:-:1 ISCADD readEs, tid32, readEs, 5; +--:-:-:-:1 IADD readIs, readIs, tid32; +--:-:-:-:1 IADD3 readEs, readEs, 4x, tid32; + +--:-:-:-:1 MOV32I 
swapBuf, 4x; + +// Remap ctrs for better L1 cache performance with small N +// Maximize the amount of overlapping data requested within a warp. +// The L1 is partitioned in to 2 groups of 2 warps. +// ctrs = idx_C*64 + tidX*4 +--:-:-:-:1 SHL tidX4, tidX, 2; +--:-:-:-:1 ISCADD ctrs0, idx_C, tidX4, 6; +--:-:-:-:1 IADD ctrs1, ctrs0, 1; +--:-:-:-:1 IADD ctrs2, ctrs0, 2; +--:-:-:-:1 IADD ctrs3, ctrs0, 3; + +// c = ctrs / RST +--:-:-:-:1 XMAD.LO2C c0, ctrs0, param_magic_TRS, RZ; +--:-:-:-:1 XMAD.LO2C c1, ctrs1, param_magic_TRS, RZ; +--:-:-:-:1 XMAD.LO2C c2, ctrs2, param_magic_TRS, RZ; +--:-:-:-:1 XMAD.LO2C c3, ctrs3, param_magic_TRS, RZ; +--:-:-:-:1 SHR.U32 c0, c0, param_shift_TRS; +--:-:-:-:1 SHR.U32 c1, c1, param_shift_TRS; +--:-:-:-:1 SHR.U32 c2, c2, param_shift_TRS; +--:-:-:-:1 SHR.U32 c3, c3, param_shift_TRS; +// trs = ctrs % RST +--:-:-:-:1 IADD neg_TRS, RZ, -param_TRS; +--:-:-:-:1 XMAD.S16.U16 trs0, neg_TRS, c0, ctrs0; +--:-:-:-:1 XMAD.S16.U16 trs1, neg_TRS, c1, ctrs1; +--:-:-:-:1 XMAD.S16.U16 trs2, neg_TRS, c2, ctrs2; +--:-:-:-:1 XMAD.S16.U16 trs3, neg_TRS, c3, ctrs3; + +// t = trs / RS +--:-:-:-:1 XMAD t0, trs0, param_magic_RS, RZ; +--:-:-:-:1 XMAD t1, trs1, param_magic_RS, RZ; +--:-:-:-:1 XMAD t2, trs2, param_magic_RS, RZ; +--:-:-:-:1 XMAD t3, trs3, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t0, t0, param_shift_RS; +--:-:-:-:1 SHR.U32 t1, t1, param_shift_RS; +--:-:-:-:1 SHR.U32 t2, t2, param_shift_RS; +--:-:-:-:1 SHR.U32 t3, t3, param_shift_RS; +// rs = trs % RS +--:-:-:-:1 IADD neg_RS, RZ, -param_RS; +--:-:-:-:1 XMAD.S16.U16 rs0, neg_RS, t0, trs0; +--:-:-:-:1 XMAD.S16.U16 rs1, neg_RS, t1, trs1; +--:-:-:-:1 XMAD.S16.U16 rs2, neg_RS, t2, trs2; +--:-:-:-:1 XMAD.S16.U16 rs3, neg_RS, t3, trs3; + +// r = rs / S +--:-:-:-:1 XMAD r0, rs0, param_magic_S, RZ; +--:-:-:-:1 XMAD r1, rs1, param_magic_S, RZ; +--:-:-:-:1 XMAD r2, rs2, param_magic_S, RZ; +--:-:-:-:1 XMAD r3, rs3, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r0, r0, param_shift_S; +--:-:-:-:1 SHR.U32 r1, r1, 
param_shift_S; +--:-:-:-:1 SHR.U32 r2, r2, param_shift_S; +--:-:-:-:1 SHR.U32 r3, r3, param_shift_S; +// s = rs % S +--:-:-:-:1 IADD neg_S, RZ, -param_S; +--:-:-:-:1 XMAD.S16.U16 s0, neg_S, r0, rs0; +--:-:-:-:1 XMAD.S16.U16 s1, neg_S, r1, rs1; +--:-:-:-:1 XMAD.S16.U16 s2, neg_S, r2, rs2; +--:-:-:-:1 XMAD.S16.U16 s3, neg_S, r3, rs3; + +--:-:-:-:1 LOP.AND n, tid, param_superN; +--:-:-:-:1 SHL n, n, 2; + +// M,C,K are static coords so compute offsets and predicates once +--:-:-:-:1 SHL m, idx_M, param_shiftM; +--:-:-:-:1 BFE.U32 super_m, tid7, param_superM; +--:-:-:-:1 IADD m, m, super_m; + +// z = m * str_d - pad_d + (t * dil_d) +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; + +--:-:-:-:1 XMAD z0, t0, param_dil_d, mt; +--:-:-:-:1 XMAD z1, t1, param_dil_d, mt; +--:-:-:-:1 XMAD z2, t2, param_dil_d, mt; +--:-:-:-:1 XMAD z3, t3, param_dil_d, mt; +--:-:-:-:1 IADD z0, z0, -param_pad_d; +--:-:-:-:1 IADD z1, z1, -param_pad_d; +--:-:-:-:1 IADD z2, z2, -param_pad_d; +--:-:-:-:1 IADD z3, z3, -param_pad_d; + +// czOffset = c*DHWN + z*HWN +--:-:-:-:1 XMAD.LO2C czOffset0, c0, param_DHWN, RZ; +--:-:-:-:1 XMAD.LO2C czOffset1, c1, param_DHWN, RZ; +--:-:-:-:1 XMAD.LO2C czOffset2, c2, param_DHWN, RZ; +--:-:-:-:1 XMAD.LO2C czOffset3, c3, param_DHWN, RZ; +--:-:-:-:1 XMAD.S16.U16.LO2C czOffset0, z0, param_HWN, czOffset0; +--:-:-:-:1 XMAD.S16.U16.LO2C czOffset1, z1, param_HWN, czOffset1; +--:-:-:-:1 XMAD.S16.U16.LO2C czOffset2, z2, param_HWN, czOffset2; +--:-:-:-:1 XMAD.S16.U16.LO2C czOffset3, z3, param_HWN, czOffset3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, c0, param_C, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, c1, param_C, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, c2, param_C, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, c3, param_C, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, z0, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_D, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_D, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_D, P3; +--:-:-:-:1 ISETP.GE.AND P0, PT, z0, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, z1, 
RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; +--:-:-:-:1 P2R predI, PR, RZ, 0x0f; +--:-:-:-:1 SHL predI, predI, 8; + +// k = idx_K*32 + tidX +--:-:-:-:1 ISCADD k, idx_K, tidX, 5; + +// kmOffset = k*MPQN + m*PQN +--:-:-:-:1 XMAD.LO2C kmOffset, k, param_MPQN, RZ; +--:-:-:-:1 XMAD.LO2C kmOffset, m, param_PQN, kmOffset; + +--:-:-:-:1 IADD k16, k, 16; +--:-:-:-:1 ISETP.LT.AND P4, PT, m, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, k16, param_K, P4; +--:-:-:-:1 P2R predE, PR, RZ, 0x03; +--:-:-:-:1 SHL predE, predE, 2; + + + +--:-:-:-:5 CAL CALC_OFFSETS; +--:-:-:-:5 CAL DO_LOADS; +--:-:-:-:5 CAL CALC_OFFSETS; + +[+ + our $convert_in; + return $convert_in ? qq{ +02:-:-:-:1 $convert_in I03, I01.H1; +--:-:-:-:1 $convert_in I02, I01.H0; +--:-:-:-:1 $convert_in I01, I00.H1; +--:-:-:-:1 $convert_in I00, I00.H0; + +--:-:-:-:1 $convert_in I13, I11.H1; +--:-:-:-:1 $convert_in I12, I11.H0; +--:-:-:-:1 $convert_in I11, I10.H1; +--:-:2:-:1 $convert_in I10, I10.H0; + +04:-:-:-:1 $convert_in I23, I21.H1; +--:-:-:-:1 $convert_in I22, I21.H0; +--:-:-:-:1 $convert_in I21, I20.H1; +--:-:-:-:1 $convert_in I20, I20.H0; + +--:-:-:-:1 $convert_in I33, I31.H1; +--:-:-:-:1 $convert_in I32, I31.H0; +--:-:-:-:1 $convert_in I31, I30.H1; +--:-:3:-:1 $convert_in I30, I30.H0; + +08:-:-:-:1 $convert_in E03, E01.H1; +--:-:-:-:1 $convert_in E02, E01.H0; +--:-:-:-:1 $convert_in E01, E00.H1; +--:-:4:-:1 $convert_in E00, E00.H0; + +10:-:-:-:1 $convert_in E13, E11.H1; +--:-:-:-:1 $convert_in E12, E11.H0; +--:-:-:-:1 $convert_in E11, E10.H1; +--:-:5:-:1 $convert_in E10, E10.H0; + } : ''; ++] + +02:-:-:-:1 STS [writeIs + 4x<0*64 + 0*16>], I00; +--:-:-:-:1 STS [writeIs + 4x<1*64 + 0*16>], I01; +--:-:-:-:1 STS [writeIs + 4x<2*64 + 0*16>], I02; +--:-:-:-:1 STS [writeIs + 4x<3*64 + 0*16>], I03; + +--:-:-:-:1 STS [writeIs + 4x<0*64 + 1*16>], I10; +--:-:-:-:1 STS [writeIs + 4x<1*64 + 1*16>], I11; +--:-:-:-:1 STS 
[writeIs + 4x<2*64 + 1*16>], I12; +--:-:-:-:1 STS [writeIs + 4x<3*64 + 1*16>], I13; + +04:-:-:-:1 STS [writeIs + 4x<0*64 + 2*16>], I20; +--:-:-:-:1 STS [writeIs + 4x<1*64 + 2*16>], I21; +--:-:-:-:1 STS [writeIs + 4x<2*64 + 2*16>], I22; +--:-:-:-:1 STS [writeIs + 4x<3*64 + 2*16>], I23; + +--:-:-:-:1 STS [writeIs + 4x<0*64 + 3*16>], I30; +--:-:-:-:1 STS [writeIs + 4x<1*64 + 3*16>], I31; +--:-:-:-:1 STS [writeIs + 4x<2*64 + 3*16>], I32; +--:-:-:-:1 STS [writeIs + 4x<3*64 + 3*16>], I33; + +08:-:-:-:1 STS [writeEs + 4x<0*32 + 0*16>], E00; +--:-:-:-:1 STS [writeEs + 4x<1*32 + 0*16>], E01; +--:-:-:-:1 STS [writeEs + 4x<2*32 + 0*16>], E02; +--:-:-:-:1 STS [writeEs + 4x<3*32 + 0*16>], E03; + +10:-:-:-:1 STS [writeEs + 4x<0*32 + 1*16>], E10; +--:-:-:-:1 STS [writeEs + 4x<1*32 + 1*16>], E11; +--:-:-:-:1 STS [writeEs + 4x<2*32 + 1*16>], E12; +--:-:-:-:1 STS [writeEs + 4x<3*32 + 1*16>], E13; + +// init = bNextY ? 1 : 0 +--:-:-:-:0 SEL init, RZ, 1, !P6; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD writeEs, writeEs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy4, [readIs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*32 + 16>]; + +--:-:-:-:5 CAL DO_LOADS; + +// init += bNextY ? 
1 : 0 +--:-:-:-:0 @P6 IADD init, init, 1; + +--:-:-:-:5 CAL CALC_OFFSETS; +--:-:-:-:5 BRA.U MAIN_LOOP; + +DO_LOADS: + + + +--:-:-:-:1 R2P PR, predI, 0x0f; +--:-:2:-:1 @P0 LDG.E.CI.[+ vec_size() +] I0, [track0I]; +--:-:2:-:1 @P1 LDG.E.CI.[+ vec_size() +] I1, [track1I]; +--:-:3:-:1 @P2 LDG.E.CI.[+ vec_size() +] I2, [track2I]; +--:-:3:-:1 @P3 LDG.E.CI.[+ vec_size() +] I3, [track3I]; +--:-:-:-:1 @!P0 LDS.U.[+ vec_size() +] I0, [addr_zero]; +--:-:-:-:1 @!P1 LDS.U.[+ vec_size() +] I1, [addr_zero]; +--:-:-:-:1 @!P2 LDS.U.[+ vec_size() +] I2, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.[+ vec_size() +] I3, [addr_zero]; + +--:-:-:-:1 R2P PR, predE, 0x03; +--:-:4:-:1 @P0 LDG.E.CI.[+ vec_size() +] E0, [track0E]; +--:6:5:-:1 @P1 LDG.E.CI.[+ vec_size() +] E1, [track1E]; +--:-:-:-:1 @!P0 LDS.U.[+ vec_size() +] E0, [addr_zero]; +--:-:2:-:1 @!P1 LDS.U.[+ vec_size() +] E1, [addr_zero]; + + +// Advance offset/preds +--:-:-:-:1 IADD n, n, param_loopN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +--:-:-:-:1 @!P4 LOP.AND n, tid7, param_superN; +--:-:-:-:1 @!P4 SHL n, n, 2; +--:-:-:-:1 @!P4 IADD idx_Q, idx_Q, param_strideQ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, idx_Q, param_gridQ, PT; + +--:-:-:-:1 @!P5 MOV idx_Q, start_Q; +--:-:-:-:1 @!P5 IADD idx_P, idx_P, param_strideP; + +--:-:-:-:1 ISETP.LT.AND P6, PT, idx_P, param_gridP, PT; +--:-:-:-:0 ISETP.LT.AND P5, PT, idx_Q, param_gridQ, P6; + +--:-:-:-:1 @!P6 MOV predI, RZ; +--:-:-:-:1 @!P6 MOV predE, RZ; + + +--:-:-:-:5 RET; + +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; + +CALC_OFFSETS: + + +// Calc superblock coordinates in m,p,q space +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; + +// Calc this thread's offset within the superblock +--:-:-:-:1 BFE.U32 super_p, tid7, param_superP; +--:-:-:-:1 BFE.U32 super_q, tid7, param_superQ; + +// Combine offsets for final m,p,q coordinate +--:-:-:-:1 IADD p, p, super_p; +--:-:-:-:1 IADD q, q, super_q; + +// y = p * str_h - pad_h + (r * dil_h) +// 
x = q * str_w - pad_w + (s * dil_w) +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, q, param_str_w, RZ; + +--:-:-:-:1 XMAD y0, r0, param_dil_h, pr; +--:-:-:-:1 XMAD y1, r1, param_dil_h, pr; +--:-:-:-:1 XMAD y2, r2, param_dil_h, pr; +--:-:-:-:1 XMAD y3, r3, param_dil_h, pr; +--:-:-:-:1 IADD y0, y0, -param_pad_h; +--:-:-:-:1 IADD y1, y1, -param_pad_h; +--:-:-:-:1 IADD y2, y2, -param_pad_h; +--:-:-:-:1 IADD y3, y3, -param_pad_h; + +--:-:-:-:1 XMAD x0, s0, param_dil_w, qs; +--:-:-:-:1 XMAD x1, s1, param_dil_w, qs; +--:-:-:-:1 XMAD x2, s2, param_dil_w, qs; +--:-:-:-:1 XMAD x3, s3, param_dil_w, qs; +--:-:-:-:1 IADD x0, x0, -param_pad_w; +--:-:-:-:1 IADD x1, x1, -param_pad_w; +--:-:-:-:1 IADD x2, x2, -param_pad_w; +--:-:-:-:1 IADD x3, x3, -param_pad_w; + +// trackI = c*DHWN + z*HWN + y*WN + x*N + n +--:-:-:-:1 XMAD.S16.U16.LO2C ti0, y0, param_WN, n; +--:-:-:-:1 XMAD.S16.U16.LO2C ti1, y1, param_WN, n; +--:-:-:-:1 XMAD.S16.U16.LO2C ti2, y2, param_WN, n; +--:-:-:-:1 XMAD.S16.U16.LO2C ti3, y3, param_WN, n; +--:-:-:-:1 XMAD.S16.U16 ti0, x0, param_N, ti0; +--:-:-:-:1 XMAD.S16.U16 ti1, x1, param_N, ti1; +--:-:-:-:1 XMAD.S16.U16 ti2, x2, param_N, ti2; +--:-:-:-:1 XMAD.S16.U16 ti3, x3, param_N, ti3; +--:-:-:-:1 IADD ti0, ti0, czOffset0; +--:-:-:-:1 IADD ti1, ti1, czOffset1; +--:-:-:-:1 IADD ti2, ti2, czOffset2; +--:-:-:-:1 IADD ti3, ti3, czOffset3; + +20:-:-:-:1 LEA track0I0.CC, ti0, param_I[0], [+ dtype_shift() +]; +--:-:-:-:1 ISET.LT.AND ti0, ti0, RZ, PT; +--:-:-:-:1 IADD.X track0I1, ti0, param_I[1]; +--:-:-:-:1 LEA track1I0.CC, ti1, param_I[0], [+ dtype_shift() +]; +--:-:-:-:1 ISET.LT.AND ti1, ti1, RZ, PT; +--:-:-:-:1 IADD.X track1I1, ti1, param_I[1]; +--:-:-:-:1 LEA track2I0.CC, ti2, param_I[0], [+ dtype_shift() +]; +--:-:-:-:1 ISET.LT.AND ti2, ti2, RZ, PT; +--:-:-:-:1 IADD.X track2I1, ti2, param_I[1]; +--:-:-:-:1 LEA track3I0.CC, ti3, param_I[0], [+ dtype_shift() +]; +--:-:-:-:1 ISET.LT.AND ti3, ti3, RZ, PT; +--:-:-:-:1 IADD.X track3I1, ti3, param_I[1]; + 
+--:-:-:-:1 SHR.U32 predI, predI, 8; +--:-:-:-:1 R2P PR, predI, 0x0f; +--:-:-:-:1 SHL predI, predI, 4; + +--:-:-:-:1 ISETP.LT.AND P0, PT, y0, param_H, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_H, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_H, P3; +--:-:-:-:1 ISETP.GE.AND P0, PT, y0, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, y1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, y2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, y3, RZ, P3; +--:-:-:-:1 P2R predI, PR, predI, 0x0f; +--:-:-:-:1 SHL predI, predI, 4; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x0, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, P3; +--:-:-:-:1 ISETP.GE.AND P0, PT, x0, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R predI, PR, predI, 0x0f; + +// trackE = k*MPQN + m*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD.LO2C te, p, param_QN, n; +--:-:-:-:1 XMAD te, q, param_N, te; +--:-:-:-:1 IADD te, te, kmOffset; + +--:-:-:-:1 LEA track0E0.CC, te, param_E[0], [+ dtype_shift() +]; +--:-:-:-:1 LEA.HI.X track0E1, te, param_E[1], RZ, [+ dtype_shift() +]; +--:-:-:-:1 IADD track1E0.CC, track0E0, param_MPQN16p; +--:-:-:-:0 IADD.X track1E1, track0E1, RZ; + +--:-:-:-:1 ISET.LT.AND qIn, p, param_P, PT; +--:-:-:-:1 ISET.LT.AND pIn, q, param_Q, PT; +--:-:-:-:1 SHR.U32 predEt, predE, 2; +--:-:-:-:1 LOP3.LUT predEt, predEt, pIn, qIn, 0x80; +--:-:-:-:1 BFI predE, predEt, 0x200, predE; + + +--:-:-:-:5 RET; + + +MAIN_LOOP: +[+ + our ($vec_size, $convert_in, $largeN); + my %insert = ( + + j0c8 => "--:-:-:-:1 R2P PR, predI, 0x0f;\n", + + $convert_in ? 
( + j1c5 => "--:-:-:-:1 DEPBAR.LE SB1, 1;\n", + j1c8 => "--:-:-:-:1 $convert_in I03, I01.H1;\n", + j1c10 => "--:-:-:-:1 $convert_in I02, I01.H0;\n", + j1c12 => "--:-:-:-:1 $convert_in I01, I00.H1;\n", + j1c14 => "--:-:6:-:1 $convert_in I00, I00.H0;\n", + + j2c5 => "--:-:-:-:1 DEPBAR.LE SB1, 1;\n", + j2c8 => "--:-:-:-:1 $convert_in I13, I11.H1;\n", + j2c10 => "--:-:-:-:1 $convert_in I12, I11.H0;\n", + j2c12 => "--:-:-:-:1 $convert_in I11, I10.H1;\n", + j2c14 => "--:-:6:-:1 $convert_in I10, I10.H0;\n", + + j3c5 => "--:-:-:-:1 DEPBAR.LE SB2, 1;\n", + j3c8 => "--:-:-:-:1 $convert_in I23, I21.H1;\n", + j3c10 => "--:-:-:-:1 $convert_in I22, I21.H0;\n", + j3c12 => "--:-:-:-:1 $convert_in I21, I20.H1;\n", + j3c14 => "--:-:6:-:1 $convert_in I20, I20.H0;\n", + + j4c5 => "--:-:-:-:1 DEPBAR.LE SB2, 1;\n", + j4c8 => "--:-:-:-:1 $convert_in I33, I31.H1;\n", + j4c10 => "--:-:-:-:1 $convert_in I32, I31.H0;\n", + j4c12 => "--:-:-:-:1 $convert_in I31, I30.H1;\n", + j4c14 => "--:-:6:-:1 $convert_in I30, I30.H0;\n", + + j5c8 => "08:-:-:-:1 $convert_in E03, E01.H1;\n", + j5c10 => "--:-:-:-:1 $convert_in E02, E01.H0;\n", + j5c12 => "--:-:-:-:1 $convert_in E01, E00.H1;\n", + j5c14 => "--:-:4:-:1 $convert_in E00, E00.H0;\n", + + j6c8 => "10:-:-:-:1 $convert_in E13, E11.H1;\n", + j6c10 => "--:-:-:-:1 $convert_in E12, E11.H0;\n", + j6c12 => "--:-:-:-:1 $convert_in E11, E10.H1;\n", + j6c14 => "--:-:5:-:1 $convert_in E10, E10.H0;\n", + ) : ( + j1c27 => "--:-:-:-:1 DEPBAR.LE SB1, 1;\n", + j2c27 => "--:-:-:-:1 DEPBAR.LE SB1, 1;\n", + j3c27 => "--:-:-:-:1 DEPBAR.LE SB2, 1;\n", + j4c27 => "--:-:-:-:1 DEPBAR.LE SB2, 1;\n", + ), + + j1c30 => "20:-:-:-:1 STS [writeIs + 4x<0*64 + 0*16>], I00;\n", + j1c32 => "--:-:-:-:1 STS [writeIs + 4x<1*64 + 0*16>], I01;\n", + j1c34 => "--:-:-:-:1 STS [writeIs + 4x<2*64 + 0*16>], I02;\n", + j1c36 => "--:6:-:-:1 STS [writeIs + 4x<3*64 + 0*16>], I03;\n", + j1c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];\n", + j1c60 => "20:-:2:-:1 \@P0 LDG.E.CI.$vec_size 
I0, [track0I];\n", + + j2c30 => "20:-:-:-:1 STS [writeIs + 4x<0*64 + 1*16>], I10;\n", + j2c32 => "--:-:-:-:1 STS [writeIs + 4x<1*64 + 1*16>], I11;\n", + j2c34 => "--:-:-:-:1 STS [writeIs + 4x<2*64 + 1*16>], I12;\n", + j2c36 => "--:6:-:-:1 STS [writeIs + 4x<3*64 + 1*16>], I13;\n", + j2c38 => "--:-:-:-:1 \@!P1 LDS.U.$vec_size I1, [addr_zero];\n", + j2c60 => "20:-:2:-:1 \@P1 LDG.E.CI.$vec_size I1, [track1I];\n", + + j3c30 => "20:-:-:-:1 STS [writeIs + 4x<0*64 + 2*16>], I20;\n", + j3c32 => "--:-:-:-:1 STS [writeIs + 4x<1*64 + 2*16>], I21;\n", + j3c34 => "--:-:-:-:1 STS [writeIs + 4x<2*64 + 2*16>], I22;\n", + j3c36 => "--:6:-:-:1 STS [writeIs + 4x<3*64 + 2*16>], I23;\n", + j3c38 => "--:-:-:-:1 \@!P2 LDS.U.$vec_size I2, [addr_zero];\n", + j3c60 => "20:-:3:-:1 \@P2 LDG.E.CI.$vec_size I2, [track2I];\n", + + j4c30 => "20:-:-:-:1 STS [writeIs + 4x<0*64 + 3*16>], I30;\n", + j4c32 => "--:-:-:-:1 STS [writeIs + 4x<1*64 + 3*16>], I31;\n", + j4c34 => "--:-:-:-:1 STS [writeIs + 4x<2*64 + 3*16>], I32;\n", + j4c36 => "--:6:-:-:1 STS [writeIs + 4x<3*64 + 3*16>], I33;\n", + j4c38 => "--:-:-:-:1 \@!P3 LDS.U.$vec_size I3, [addr_zero];\n", + j4c60 => "20:-:3:-:1 \@P3 LDG.E.CI.$vec_size I3, [track3I];\n", + + j5c7 => "--:-:-:-:1 R2P PR, predE, 0x0f;\n", + + j5c30 => "08:-:-:-:1 STS [writeEs + 4x<0*32 + 0*16>], E00;\n", + j5c32 => "--:-:-:-:1 STS [writeEs + 4x<1*32 + 0*16>], E01;\n", + j5c34 => "--:-:-:-:1 STS [writeEs + 4x<2*32 + 0*16>], E02;\n", + j5c36 => "--:4:-:-:1 STS [writeEs + 4x<3*32 + 0*16>], E03;\n", + j5c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size E0, [addr_zero];\n", + j5c60 => "08:-:4:-:1 \@P0 LDG.E.CI.$vec_size E0, [track0E];\n", + + j6c30 => "10:-:-:-:1 STS [writeEs + 4x<0*32 + 1*16>], E10;\n", + j6c32 => "--:-:-:-:1 STS [writeEs + 4x<1*32 + 1*16>], E11;\n", + j6c34 => "--:-:-:-:1 STS [writeEs + 4x<2*32 + 1*16>], E12;\n", + j6c36 => "--:5:-:-:1 STS [writeEs + 4x<3*32 + 1*16>], E13;\n", + j6c38 => "--:-:-:-:1 \@!P1 LDS.U.$vec_size E1, [addr_zero];\n", + j6c60 => "10:6:5:-:1 \@P1 
LDG.E.CI.$vec_size E1, [track1E];\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 IADD writeEs, writeEs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j7c15 => "--:-:-:-:1 PSETP.OR.AND P4, PT, P5, P6, PT;\n", + j7c17 => "--:-:-:-:1 IADD n, n, param_loopN;\n", + j7c27 => "--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, P4;\n", + + $largeN ? ( + j7c30 => "20:-:-:-:1 IADD track0I0.CC, track0I0, param_loopNp;\n", + j7c35 => "--:-:-:-:1 IADD.X track0I1, track0I1, RZ;\n" . + "--:-:-:-:1 IADD track1I0.CC, track1I0, param_loopNp;\n", + j7c40 => "--:-:-:-:1 IADD.X track1I1, track1I1, RZ;\n" . + "--:-:-:-:1 IADD track2I0.CC, track2I0, param_loopNp;\n", + j7c45 => "--:-:-:-:1 IADD.X track2I1, track2I1, RZ;\n" . + "--:-:-:-:1 IADD track3I0.CC, track3I0, param_loopNp;\n", + j7c50 => "--:-:-:-:1 IADD.X track3I1, track3I1, RZ;\n" . + "--:-:-:-:1 IADD track0E0.CC, track0E0, param_loopNp;\n", + j7c55 => "--:-:-:-:1 IADD.X track0E1, track0E1, RZ;\n" . + "--:-:-:-:1 IADD track1E0.CC, track1E0, param_loopNp;\n", + j7c60 => "--:-:-:-:1 IADD.X track1E1, track1E1, RZ;\n", + ) : (), + + j7c63 => "--:-:-:Y:5 \@P4 BRA.U MAIN_LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 
7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) & 7; + my $shift = ((($j + 1) & 7) >> 2) << 2; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy0, [readIs + 4x<%d*64 + 00 + %d>];\n", $nOdd, $rsOffset, $shift; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 LDS.U.128 j%dEx0, [readEs + 4x<%d*32 + 00 + %d>];\n", $nOdd, $rsOffset, $shift; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy4, [readIs + 4x<%d*64 + 32 + %d>];\n", $nOdd, $rsOffset, $shift; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 LDS.U.128 j%dEx4, [readEs + 4x<%d*32 + 16 + %d>];\n", $nOdd, $rsOffset, $shift; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1; + + my $yield = $c == 25 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] +// Advance x/q offsets+preds + +--:-:-:-:1 IADD x0, x0, param_loopX; +--:-:-:-:1 IADD x1, x1, param_loopX; +--:-:-:-:1 IADD x2, x2, param_loopX; +--:-:-:-:1 IADD x3, x3, param_loopX; +20:-:-:-:1 IADD track0I0.CC, track0I0, param_loopXp; +--:-:-:-:1 IADD.X track0I1, track0I1, RZ; +--:-:-:-:1 IADD track1I0.CC, track1I0, param_loopXp; +--:-:-:-:1 IADD.X track1I1, track1I1, RZ; +--:-:-:-:1 IADD track2I0.CC, track2I0, param_loopXp; +--:-:-:-:1 IADD.X track2I1, track2I1, RZ; +--:-:-:-:1 IADD track3I0.CC, track3I0, param_loopXp; +--:-:-:-:1 IADD.X track3I1, track3I1, RZ; + +--:-:-:-:1 SHR.U32 predI, predI, 4; +--:-:-:-:1 @P6 R2P PR, predI, 0x0f; +--:-:-:-:1 SHL predI, predI, 4; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x0, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, P3; +--:-:-:-:1 ISETP.GE.AND P0, PT, x0, 
RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R predI, PR, predI, 0x0f; + +--:-:-:-:1 IADD q, q, param_loopQ; +--:-:-:-:1 ISETP.LT.AND P4, PT, q, param_Q, PT; +--:-:-:-:1 @!P4 LOP.AND predE, predE, 0xc; + +--:-:-:-:1 IADD track0E0.CC, track0E0, param_loopQp; +--:-:-:-:1 IADD.X track0E1, track0E1, RZ; +--:-:-:-:1 IADD track1E0.CC, track1E0, param_loopQp; + +--:-:-:-:1 IADD idx_Q, idx_Q, param_strideQ; +--:-:-:-:1 ISETP.LT.AND P5, PT, idx_Q, param_gridQ, P6; + +--:-:-:-:1 LOP.AND n, tid7, param_superN; +--:-:-:-:1 SHL n, n, 2; + + +--:-:-:-:0 IADD.X track1E1, track1E1, RZ; +--:-:-:Y:5 @P5 BRA.U MAIN_LOOP; + +// Advance y/p offsets+preds + +--:-:-:-:1 MOV idx_Q, start_Q; +--:-:-:-:1 IADD idx_P, idx_P, param_strideP; + +--:-:-:-:1 PSETP.AND.AND P5, PT, PT, PT, PT; +--:-:-:Y:d ISETP.LT.AND P6, PT, idx_P, param_gridP, PT; + +--:-:-:Y:5 @!P6 BRA.U FINISH_LOOP; +--:-:-:-:5 CAL CALC_OFFSETS; +--:-:-:Y:5 @P6 BRA.U MAIN_LOOP; + +// Set n to loop remaining times +FINISH_LOOP: +--:-:-:-:1 LOP.AND.NZ P5, RZ, init, 3; +--:-:-:-:1 MOV predI, RZ; +--:-:-:-:1 MOV predE, RZ; +--:-:-:-:1 MOV loopN, param_loopN; +--:-:-:Y:8 MOV N, param_N; +--:-:-:-:1 VMAD.U16.U16 n, -init, loopN, N; +--:-:-:-:0 MOV init, RZ; +01:-:-:Y:5 @P5 BRA.U MAIN_LOOP; + + +--:-:1:-:2 S2R Tid, SR_TID.X; + +01:-:-:-:1 SHR.U32 tid_32, Tid, 5; +--:-:-:-:1 LOP.AND tid_31, Tid, 31; + +// readFs = (tid_32 << 7 + tid_31) << 2 +--:-:-:-:1 ISCADD readFs, tid_32, tid_31, 7; +--:-:-:-:1 SHL readFs, readFs, 2; + +// kk = idx_K*32 + tid31; +--:-:-:-:1 ISCADD kk, idx_K, tid_31, 5; +// kk < K +--:-:-:-:1 ISETP.LT.AND P4, PT, kk, param_K, PT; + +// crst = idx_C*64 + tid_32*4 +--:-:-:-:1 SHL tid_32, tid_32, 2; +--:-:-:-:1 ISCADD crst00, idx_C, tid_32, 6; +--:-:-:-:1 IADD crst04, crst00, 16; +--:-:-:-:1 IADD crst08, crst00, 32; +--:-:-:-:1 IADD crst12, crst00, 48; + +--:-:-:-:1 MOV K, param_K; +--:-:-:-:1 SHL K1, K, 2; 
+--:-:-:-:1 SHL K16, K, 6; + +--:-:-:-:1 MOV alpha, param_alpha; + +// trackF += crst*K + k; +--:-:-:-:1 XMAD.LO2 tf, crst00, K, kk; +[+ + our $determ; + return $determ ? q{ +// idx_MPQ = idx_M * grid_PQ + idx_P * grid_Q + idx_Q +// trackF += idx_MPQ * CRSTK +--:-:-:-:1 XMAD idx_MPQ, start_P, param_strideQ, start_Q; +--:-:-:-:1 XMAD.LO2C idx_MPQ, idx_M, param_stridePQ, idx_MPQ; +--:-:-:-:1 XMAD.LO tf, idx_MPQ, param_CTRSK, tf, xmad_determ; + } : ''; ++] +--:-:-:-:1 LEA track00F0.CC, tf, param_F[0], 2; +--:-:-:-:1 LEA.HI.X track00F1, tf, param_F[1], RZ, 2; +--:-:-:-:1 IADD track04F0.CC, track00F0, K16; +--:-:-:-:1 IADD.X track04F1, track00F1, RZ; +--:-:-:-:1 IADD track08F0.CC, track04F0, K16; +--:-:-:-:1 IADD.X track08F1, track04F1, RZ; +--:-:-:-:1 IADD track12F0.CC, track08F0, K16; +--:-:-:-:1 IADD.X track12F1, track08F1, RZ; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, 
cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:1 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:1 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:1 STS.128 [writeFs+4x<0*128 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 [writeFs+4x<0*128 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeFs+4x<1*128 + 00>], shuffle_x0y1; +--:-:-:-:1 STS.128 [writeFs+4x<1*128 + 16>], shuffle_x4y1; +--:-:-:-:1 STS.128 [writeFs+4x<2*128 + 00>], shuffle_x0y2; +--:-:-:-:1 STS.128 [writeFs+4x<2*128 + 16>], shuffle_x4y2; +--:-:-:-:1 STS.128 [writeFs+4x<3*128 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeFs+4x<3*128 + 16>], shuffle_x4y3; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_F; +--:-:-:-:0 IADD readFs, readFs, 4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_F; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:0 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:1 FMUL shuffle_x7y4, cx7y4, alpha; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:1 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:1 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:1 FMUL shuffle_x7y6, cx7y6, 
alpha; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:1 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:1 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:1 STS.128 [writeFs+4x<0*128 + 00>], shuffle_x0y4; +--:-:-:-:1 STS.128 [writeFs+4x<0*128 + 16>], shuffle_x4y4; +--:-:-:-:1 STS.128 [writeFs+4x<1*128 + 00>], shuffle_x0y5; +--:-:-:-:1 STS.128 [writeFs+4x<1*128 + 16>], shuffle_x4y5; +--:-:-:-:1 STS.128 [writeFs+4x<2*128 + 00>], shuffle_x0y6; +--:-:-:-:1 STS.128 [writeFs+4x<2*128 + 16>], shuffle_x4y6; +--:-:-:-:1 STS.128 [writeFs+4x<3*128 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeFs+4x<3*128 + 16>], shuffle_x4y7; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:0 IADD readFs, readFs, -4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_F; +--:-:-:-:0 IADD readFs, readFs, 4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_F; + +--:-:-:-:5 EXIT; + +STORE_F: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, crst00, param_CTRS, P4; // crst00 < CRST && k < K +--:-:-:-:1 IADD crst00, crst00, 1; +--:-:-:-:1 ISETP.LT.AND P1, PT, crst04, param_CTRS, P4; // crst04 < CRST && k < K +--:-:-:-:1 IADD crst04, crst04, 1; +--:-:-:-:1 ISETP.LT.AND P2, PT, crst08, param_CTRS, P4; // crst08 < CRST && k < K +--:-:-:-:1 IADD crst08, crst08, 1; +--:-:-:-:1 ISETP.LT.AND P3, PT, crst12, param_CTRS, P4; // crst12 < CRST && k < K +--:-:-:-:1 IADD crst12, crst12, 1; + +--:-:-:-:1 LDS f00_0, [readFs + 4x< 0*128 + 0*32 + 0*16>]; +--:-:-:-:1 LDS f00_1, [readFs + 4x< 0*128 + 1*32 + 0*16>]; +--:-:-:-:1 LDS f00_2, [readFs + 4x< 0*128 + 2*32 + 0*16>]; +--:-:1:Y:1 LDS f00_3, [readFs + 4x< 0*128 + 3*32 + 0*16>]; +--:-:-:-:1 LDS f04_0, [readFs + 4x< 4*128 + 0*32 + 1*16>]; +--:-:-:-:1 LDS f04_1, [readFs + 4x< 4*128 + 1*32 + 1*16>]; +--:-:-:-:1 LDS f04_2, [readFs + 4x< 4*128 + 2*32 + 1*16>]; +--:-:2:Y:1 LDS f04_3, [readFs + 
4x< 4*128 + 3*32 + 1*16>]; +--:-:-:-:1 LDS f08_0, [readFs + 4x< 8*128 + 0*32 + 2*16>]; +--:-:-:-:1 LDS f08_1, [readFs + 4x< 8*128 + 1*32 + 2*16>]; +--:-:-:-:1 LDS f08_2, [readFs + 4x< 8*128 + 2*32 + 2*16>]; +--:-:3:Y:1 LDS f08_3, [readFs + 4x< 8*128 + 3*32 + 2*16>]; +--:-:-:-:1 LDS f12_0, [readFs + 4x<12*128 + 0*32 + 3*16>]; +--:-:-:-:1 LDS f12_1, [readFs + 4x<12*128 + 1*32 + 3*16>]; +--:-:-:-:1 LDS f12_2, [readFs + 4x<12*128 + 2*32 + 3*16>]; +--:-:4:Y:1 LDS f12_3, [readFs + 4x<12*128 + 3*32 + 3*16>]; + + + +01:-:-:-:1 FADD f00_0, f00_0, f00_1; +--:-:-:-:1 FADD f00_2, f00_2, f00_3; +02:-:-:-:1 FADD f04_0, f04_0, f04_1; +--:-:-:-:1 FADD f04_2, f04_2, f04_3; +04:-:-:-:1 FADD f08_0, f08_0, f08_1; +--:-:-:-:1 FADD f08_2, f08_2, f08_3; +08:-:-:-:1 FADD f12_0, f12_0, f12_1; +--:-:-:-:1 FADD f12_2, f12_2, f12_3; + +--:-:-:-:1 FADD f00_0, f00_0, f00_2; +--:-:-:-:2 FADD f04_0, f04_0, f04_2; +--:-:-:-:2 FADD f08_0, f08_0, f08_2; +--:-:-:-:0 FADD f12_0, f12_0, f12_2; + +01:1:-:-:1 @P0 [+ output_op() +] [track00F], f00_0; +02:2:-:-:1 @P1 [+ output_op() +] [track04F], f04_0; +04:3:-:-:1 @P2 [+ output_op() +] [track08F], f08_0; +08:4:-:-:1 @P3 [+ output_op() +] [track12F], f12_0; + +01:-:-:-:6 IADD track00F0.CC, track00F0, K1; +--:-:-:-:1 IADD.X track00F1, track00F1, RZ; +02:-:-:-:6 IADD track04F0.CC, track04F0, K1; +--:-:-:-:1 IADD.X track04F1, track04F1, RZ; +04:-:-:-:6 IADD track08F0.CC, track08F0, K1; +--:-:-:-:1 IADD.X track08F1, track08F1, RZ; +08:-:-:-:6 IADD track12F0.CC, track12F0, K1; +--:-:-:-:0 IADD.X track12F1, track12F1, RZ; + +--:-:-:-:5 RET; \ No newline at end of file diff --git a/Kernel/Convolution/Pascal/xconv_direct_xprop_64x32.sass b/Kernel/Convolution/Pascal/xconv_direct_xprop_64x32.sass new file mode 100644 index 0000000..4720ab8 --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_direct_xprop_64x32.sass @@ -0,0 +1,2477 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + our ($type, $SN, $N2, $N1); + our $LN = !($SN || $N2 || $N1); + our $dtype = $type eq 'h' ? 'U16' : '32'; + our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; + our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; + our $vsize = $type eq 'h' ? '64' : '128'; + our $dshift = $type eq 'h' ? '1' : '2'; + our $dsize = $type eq 'h' ? '2' : '4'; + our $slice_scale = $N1 ? 4 : $N2 ? 3 : 2; + our $slice_offset = 1 << $slice_scale; + our $slice_load = 8 << $slice_scale; + sub dtype { return $dtype; } + sub dshift { return $dshift; } + sub vsize { return $vsize; } + our $vsizeI; + if ($type eq 'h') + { $vsizeI = $N1 ? 'U16' : $N2 ? '32' : '64'; } + else + { $vsizeI = $N1 ? '32' : $N2 ? 
'64' : '128'; } +-] + + + + addr_zero : 4x<(32 + 64)*32*2> + addr_szLut : 4x<(32 + 64)*32*2 + 4> + addr_lut4 : 4x<(32 + 64)*32*2 + 4> + addr_lut : 4x<(32 + 64)*32*2 + 6> + + szShareF : (64*32) + szShareI : (32*32) + + param_Sum[0] : c[0x0][0x140] + param_Sum[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_C : c[0x0][0x174] + param_D : c[0x0][0x178] + param_H : c[0x0][0x17c] + param_W : c[0x0][0x180] + param_N : c[0x0][0x184] + param_K : c[0x0][0x188] + param_M : c[0x0][0x18c] + param_P : c[0x0][0x190] + param_Q : c[0x0][0x194] + param_str_d : c[0x0][0x198] + param_str_h : c[0x0][0x19c] + param_str_w : c[0x0][0x1a0] + param_pad_d : c[0x0][0x1a4] + param_pad_h : c[0x0][0x1a8] + param_pad_w : c[0x0][0x1ac] + param_dil_d : c[0x0][0x1b0] + param_dil_h : c[0x0][0x1b4] + param_dil_w : c[0x0][0x1b8] + param_DHWN : c[0x0][0x1bc] + param_HWN : c[0x0][0x1c0] + param_WN : c[0x0][0x1c4] + param_MPQN : c[0x0][0x1c8] + param_PQN : c[0x0][0x1cc] + param_QN : c[0x0][0x1d0] + param_PQnk : c[0x0][0x1d4] + param_Qnk : c[0x0][0x1d8] + param_nk : c[0x0][0x1dc] + param_n : c[0x0][0x1e0] + param_k : c[0x0][0x1e4] + param_magic_PQnk : c[0x0][0x1e8] + param_shift_PQnk : c[0x0][0x1ec] + param_magic_Qnk : c[0x0][0x1f0] + param_shift_Qnk : c[0x0][0x1f4] + param_magic_nk : c[0x0][0x1f8] + param_shift_nk : c[0x0][0x1fc] + param_magic_k : c[0x0][0x200] + param_shift_k : c[0x0][0x204] + param_Km32 : c[0x0][0x208] + param_K32p : c[0x0][0x20c] + param_TRSK : c[0x0][0x210] + param_TRS : c[0x0][0x214] + param_RS : c[0x0][0x218] + param_S : c[0x0][0x21c] + param_magic_RS : c[0x0][0x220] + param_shift_RS : c[0x0][0x224] + param_magic_S : c[0x0][0x228] + param_shift_S : c[0x0][0x22c] + param_gridP2 : 
c[0x0][0x230] + param_gridQ : c[0x0][0x234] + param_gridN : c[0x0][0x238] + param_gridQN : c[0x0][0x23c] + param_gridPQN : c[0x0][0x240] + param_gridMPQN : c[0x0][0x244] + param_superM : c[0x0][0x248] + param_superP : c[0x0][0x24c] + param_superQ : c[0x0][0x250] + param_superN : c[0x0][0x254] + param_shiftM : c[0x0][0x258] + param_shiftP : c[0x0][0x25c] + param_shiftQ : c[0x0][0x260] + param_shiftN : c[0x0][0x264] + param_SuperM : c[0x0][0x268] + param_SuperP : c[0x0][0x26c] + param_SuperQ : c[0x0][0x270] + param_SuperN : c[0x0][0x274] + param_magic_str_d : c[0x0][0x278] + param_shift_str_d : c[0x0][0x27c] + param_magic_str_h : c[0x0][0x280] + param_shift_str_h : c[0x0][0x284] + param_magic_str_w : c[0x0][0x288] + param_shift_str_w : c[0x0][0x28c] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Fy<0-7>, j0Ix<0-7> + 80-95 : j1Fy<0-7>, j1Ix<0-7> + + 96-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>, I0<0-3>, I1<0-3> + 120-131 : track0F<0-1>, track1F<0-1>, track2F<0-1>, track3F<0-1>, track0I<0-1>, track1I<0-1> + + 64-83 ~ tidY, m, p, q, negOne, trs, lutStore2, lut_size, warp_count, warp_inc, neg_RS, neg_S, dep_thd_mask, qs, pr, mt, neg_str_w, neg_str_h, neg_str_d + + 84-131 ~ idx_MPQnk, idx_PQnk, idx_Qnk, idx_nk, idx_n, idx_k, magic_PQnk, magic_Qnk, neg_PQnk, neg_Qnk, neg_nk, neg_k, div1, div2, div3, idx_P2, idx_Q2, super_m, super_p, super_q, super_n, tid1, tid2, tid3, tid7, tid8, tid31, tid32, readIs2, tidX, k<0|1|2|3>, sb, warp_mask, mask_shr, shiftSB, maskSB, q<1|2|3> + + 84-131 ~ rs, t, r, s, z, y, x, x<1|2|3>, z_prime, y_prime, x_prime, x_prime<1|2|3>, z_mod, y_mod, x_mod, x_mod<1|2|3>, lutStore, ballot, warp_slices, dep_thd_bits, dep_thd_cnt, tidY1 + +[+ + our ($SN, $N2, 
$N1); + return $N1 ? q{ + 132-135 : slice0I<0-3> + 168-171 : slice1I<0-3> + 172-183 : track0I<2-3>, track0I<4-5>, track0I<6-7>, track1I<2-3>, track1I<4-5>, track1I<6-7> + 184-185 ~ predsI + + } : $N2 ? q{ + 132-135 : slice0I<0-1>, slice1I<0-1> + 168-171 : track0I<2-3>, track1I<2-3> + + } : $SN ? q{ + 132-135 ~ slice0I, slice1I + + } : q{ + 132-133 : sliceI, sliceF + 132-133 : sliceIF<0-1> + 132-135 : sliceI0, sliceF0, sliceI1, sliceF1 + 132-135 : slice0IF<0-1>, slice1IF<0-1> + }; ++] + + 136-151 ~ posCTRS, endCTRS, endCTRS32, lutSize, lutSizeRcp, lutSizeM1, posCTRSf, channel, lutOffset0, lutOffset1, offsetIc0, offsetIc1, offsetFc0, offsetFc1, partial + 152-167 ~ tid, idx_K, idx_M, idx_P, idx_Q, idx_N, k, n, writeFs, writeIs, readFs, readIs, swapBuf, writeOs, preds, sb_offset + + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + + 64-95 ~ o00_<0-3>, o04_<0-3>, o08_<0-3>, o12_<0-3>, b<00|04|08|12>, x<00|04|08|12>, bsum<00|04|08|12> + 96-131 ~ tid_31, tid_32, alpha, readOs, MPQN16, MPQN4, k<00|04|08|12>, offset, one, M, P, Q, N, super_M, super_P, super_Q, super_N, bsum_offset + 0-7 : Out00_<0-1>, Out04_<0-1>, Out08_<0-1>, Out12_<0-1> + 8-15 : Sum00_<0-1>, Sum04_<0-1>, Sum08_<0-1>, Sum12_<0-1> + 16-31 ~ out<00|04|08|12>, sum<00|04|08|12> + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_MPQnk, SR_CTAID.X; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; +--:-:4:-:1 S2R idx_N, SR_CTAID.Z; + + +// tidX = (tid & 7) << 2 +// tidY = tid >> 3 << 1 +01:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHL tidX, tid7, 2; +--:-:-:-:1 SHR.U32 tid3, tid, 3; +--:-:-:-:1 SHL tidY, tid3, 1; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// idx_M = idx_MPQnk / blk_PQnk +--:-:-:-:1 MOV magic_PQnk, param_magic_PQnk; +--:-:-:-:1 ISETP.NE.AND P0, PT, magic_PQnk, 1, PT; +02:-:-:-:1 @P0 XMAD div1, 
idx_MPQnk, magic_PQnk, RZ; +--:-:-:-:1 @P0 XMAD div2, idx_MPQnk, magic_PQnk.H1, RZ; +--:-:-:-:1 @P0 XMAD div3, idx_MPQnk.H1, magic_PQnk.H1, RZ; +--:-:-:-:1 @P0 XMAD.CHI div1, idx_MPQnk.H1, magic_PQnk, div1; +--:-:-:-:1 @P0 IADD3.RS idx_M, div1, div2, div3; +--:-:-:-:1 @P0 SHR.U32 idx_M, idx_M, param_shift_PQnk; +--:-:-:-:1 @!P0 SHR.U32 idx_M, idx_MPQnk, param_shift_PQnk; + +// idx_PQnk = idx_MPQnk % blk_PQnk +--:-:-:-:1 IADD neg_PQnk, RZ, -param_PQnk; +--:-:-:-:1 XMAD.LO2 idx_PQnk, neg_PQnk, idx_M, idx_MPQnk; + +// idx_P2 = idx_PQnk / blk_Qnk +--:-:-:-:1 MOV magic_Qnk, param_magic_Qnk; +--:-:-:-:1 ISETP.NE.AND P1, PT, magic_Qnk, 1, PT; +--:-:-:-:1 @P1 XMAD div1, idx_PQnk, magic_Qnk, RZ; +--:-:-:-:1 @P1 XMAD div2, idx_PQnk, magic_Qnk.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, idx_PQnk.H1, magic_Qnk.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk, div1; +--:-:-:-:1 @P1 IADD3.RS idx_P2, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 idx_P2, idx_P2, param_shift_Qnk; +--:-:-:-:1 @!P1 SHR.U32 idx_P2, idx_PQnk, param_shift_Qnk; + +// idx_Qnk = idx_PQnk % blk_Qnk +--:-:-:-:1 IADD neg_Qnk, RZ, -param_Qnk; +--:-:-:-:1 XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk; + +// idx_Q2 = idx_Qnk / nk +--:-:-:-:1 XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ; +--:-:-:-:1 SHR.U32 idx_Q2, idx_Q2, param_shift_nk; +// idx_nk = idx_Qnk % nk +--:-:-:-:1 IADD neg_nk, RZ, -param_nk; +--:-:-:-:1 XMAD.S16.U16 idx_nk, neg_nk, idx_Q2, idx_Qnk; + +// idx_n = idx_nk / k +--:-:-:-:1 XMAD idx_n, idx_nk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_n, idx_n, param_shift_k; +// idx_k = idx_nk % k +--:-:-:-:1 IADD neg_k, RZ, -param_k; +--:-:-:-:1 XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk; + +// idx_N = idx_N * blk_n + idx_n +// idx_K = idx_K * blk_k + idx_k +08:-:-:-:1 XMAD idx_N, idx_N, param_n, idx_n; +04:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +--:-:-:-:1 ISCADD k, idx_K, tidX, 6; + + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// idx_P = idx_P2 * 2
+// idx_Q = idx_Q2 +// if idx_P2 != gridP2: +// idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1) +// idx_Q = idx_Q2 >> 1 +--:-:-:-:1 ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT; +--:-:-:-:1 SHL idx_P, idx_P2, 1; +--:-:-:-:1 @P1 LOP.AND q1, idx_Q2, 1; +--:-:-:-:1 @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P1 LOP.XOR q1, q1, q2; +--:-:-:-:1 @P1 IADD idx_P, idx_P, q1; +--:-:-:-:1 @P1 SHR.U32 idx_Q, idx_Q2, 1; +--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2; + +// Scan backwards on odd rows +// if idx_P2 & 1: +// idx_Q = gridQ - idx_Q - 1 +--:-:-:-:1 LOP.AND.NZ P0, RZ, idx_P2, 1; +--:-:-:-:1 MOV negOne, -1; +--:-:-:-:1 @P0 IADD3 idx_Q, -idx_Q, param_gridQ, negOne; + +// writeFs = (tidY*64 + tidX) * 4 +--:-:-:-:1 ISCADD writeFs, tidY, tidX, 6; +--:-:-:-:1 SHL writeFs, writeFs, 2; + +// writeIs = (tidY*32 + tidX) * 4 +--:-:-:-:1 ISCADD writeIs, tidY, tidX, 5; +--:-:-:-:1 ISCADD writeIs, writeIs, 4x, 2; // NOTE(review): bare "4x" looks truncated (other uses are 4x<expr>) - confirm against upstream + + +// readIs = ((tid >> 1) & 3) << 4 +--:-:-:-:1 BFE.U32 readIs, tid, 0x201; // 2 bits at position 1 + +// readFs = (((tid & 24) >> 2) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 24; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 LOP.OR readFs, readFs, tid1; + +// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5) +// tid32 = tid & -32 +--:-:-:-:1 LOP.AND tid32, tid, -32; + +// readIs2 = readIs + (tid32 >> 2) + (readFs << 2) +--:-:-:-:1 SHR.U32 readIs2, tid32, 2; +--:-:-:-:1 IADD readIs2, readIs2, readIs; +--:-:-:-:1 ISCADD readIs2, readFs, readIs2, 2; + +--:-:-:-:1 SHL readFs, readFs, 4; +--:-:-:-:1 SHL readIs, readIs, 4; +--:-:-:-:1 SHL readIs2, readIs2, 4; + +// writeOs = readFs*32*4 + readIs2 +--:-:-:-:1 ISCADD writeOs, readFs, readIs2, 7; + +// Each block of 32 threads works on 8 lines, +// readFs += tid32/4 * 64 * 4 +// readIs += tid32/4 * 32 * 4 + 4x +--:-:-:-:1 ISCADD readFs, tid32, readFs, 6; +--:-:-:-:1 ISCADD readIs, tid32, readIs, 5; +--:-:-:-:1 IADD readIs, readIs, 4x; // NOTE(review): bare "4x" looks truncated (other uses are 4x<expr>) - confirm against upstream +
+--:-:-:-:1 MOV32I swapBuf, 4x; + +[+ + our $K1; + return $K1 ? q{ +--:-:-:-:1 IADD k0, k, 32; +--:-:-:-:1 IADD k1, k, 33; +--:-:-:-:1 IADD k2, k, 34; +--:-:-:-:1 IADD k3, k, 35; +--:-:-:-:1 ISETP.LT.AND P0, PT, k0, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, param_K, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +--:-:-:-:1 SHL preds, preds, 4; + +--:-:-:-:1 IADD k1, k, 1; +--:-:-:-:1 IADD k2, k, 2; +--:-:-:-:1 IADD k3, k, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, param_K, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, param_K, PT; +--:-:-:-:1 P2R preds, PR, preds, 0x0f; + } : ''; ++] + +[+ + our ($SN, $N2, $N1); + return $N1 ? q{ +--:-:-:-:1 SHL m, idx_M, param_shiftM; +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; + +--:-:-:-:1 BFE.U32 super_m, tid7, param_superM; +--:-:-:-:1 BFE.U32 super_p, tid7, param_superP; +--:-:-:-:1 BFE.U32 super_q, tid7, param_superQ; + +--:-:-:-:1 IADD m, m, super_m; +--:-:-:-:1 IADD p, p, super_p; +--:-:-:-:1 ISCADD q, super_q, q, 2; +--:-:-:-:1 IADD q1, q, 1; +--:-:-:-:1 IADD q2, q, 2; +--:-:-:-:1 IADD q3, q, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, m, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, p, param_P, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, q, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, q1, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, q2, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, q3, param_Q, P4; +--:-:-:-:1 P2R predsI, PR, RZ, 0x0f; + +// warp_count = 16 +// warp_inc = 16 +// trs = tid3 +--:-:-:-:1 MOV warp_count, 16; +--:-:-:-:1 MOV warp_inc, 16; +--:-:-:-:1 MOV trs, tid3; +// compute shared memory super-block offset into the lookup table +// sb_offset = tid7 * TRS * 4 * 4 +--:-:-:-:1 XMAD sb_offset, tid7, param_TRS, RZ; +--:-:-:-:1 SHL sb_offset, sb_offset, 4; + + } : $N2 ? 
q{ + +--:-:-:-:1 SHL m, idx_M, param_shiftM; +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; + +--:-:-:-:1 BFE.U32 super_m, tid7, param_superM; +--:-:-:-:1 BFE.U32 super_p, tid7, param_superP; +--:-:-:-:1 BFE.U32 super_q, tid7, param_superQ; + +--:-:-:-:1 IADD m, m, super_m; +--:-:-:-:1 IADD p, p, super_p; +--:-:-:-:1 ISCADD q, super_q, q, 1; +--:-:-:-:1 IADD q1, q, 1; + +--:-:-:-:1 ISETP.LT.AND P4, PT, m, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, p, param_P, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, q, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, q1, param_Q, P4; + +// warp_count = 16 +// warp_inc = 16 +// trs = tid3 +--:-:-:-:1 MOV warp_count, 16; +--:-:-:-:1 MOV warp_inc, 16; +--:-:-:-:1 MOV trs, tid3; +// compute shared memory super-block offset into the lookup table +// sb_offset = tid7 * TRS * 4 * 2 +--:-:-:-:1 XMAD sb_offset, tid7, param_TRS, RZ; +--:-:-:-:1 SHL sb_offset, sb_offset, 3; + + } : $SN ? q{ +--:-:-:-:1 SHL m, idx_M, param_shiftM; +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; +--:-:-:-:1 SHL n, idx_N, param_shiftN; + +--:-:-:-:1 BFE.U32 super_m, tid7, param_superM; +--:-:-:-:1 BFE.U32 super_p, tid7, param_superP; +--:-:-:-:1 BFE.U32 super_q, tid7, param_superQ; +--:-:-:-:1 LOP.AND super_n, tid7, param_superN; + +--:-:-:-:1 IADD m, m, super_m; +--:-:-:-:1 IADD p, p, super_p; +--:-:-:-:1 IADD q, q, super_q; +--:-:-:-:1 ISCADD n, super_n, n, 2; + +--:-:-:-:1 ISETP.LT.AND P0, PT, m, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, p, param_P, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, q, param_Q, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, n, param_N, P0; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P2; + +// sb = tid7 >> (shiftN - 2): 0-1,0-3,0-7 +--:-:-:-:1 MOV shiftSB, param_shiftN; +--:-:-:-:1 IADD shiftSB, shiftSB, -2; +--:-:-:-:1 SHR.U32 sb, tid7, shiftSB; +// warp_count = 4 << shiftN: 64,32,16 +--:-:-:-:1 MOV warp_count, 4; +--:-:-:-:1 SHL warp_count, warp_count, param_shiftN; +--:-:-:-:1 MOV 
warp_inc, warp_count; +// maskSB = (1 << shiftSB) - 1: 3,1,0 +--:-:-:-:1 MOV maskSB, 1; +--:-:-:-:1 SHL maskSB, maskSB, shiftSB; +--:-:-:-:1 IADD maskSB, maskSB, -1; +// trs = tid3 << shiftSB + (tid7 & mask) +--:-:-:-:1 LOP.AND maskSB, tid7, maskSB; +--:-:-:-:1 SHL trs, tid3, shiftSB; +--:-:-:-:1 IADD trs, trs, maskSB; +// compute shared memory super-block offset into the lookup table +// sb_offset = sb * TRS * 4 +--:-:-:-:1 XMAD sb_offset, sb, param_TRS, RZ; +--:-:-:-:1 SHL sb_offset, sb_offset, 2; + + } : q{ +--:-:-:-:1 SHL n, idx_N, 5; +--:-:-:-:1 ISCADD n, tid7, n, 2; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +--:-:-:-:1 MOV trs, tid; +--:-:-:-:1 MOV lutStore2, RZ; +--:-:-:-:1 MOV lut_size, RZ; +--:-:-:-:1 MOV warp_count, 32; +--:-:-:-:1 MOV warp_inc, 32; + +--:-:-:-:1 IADD mask_shr, -tid, 32; +--:-:-:-:1 SHR.U32 dep_thd_mask, negOne, mask_shr; + +--:-:-:-:1 ISETP.GE.AND P6, PT, tid, 32, PT; + + }; ++] +--:-:-:-:1 IADD neg_RS, RZ, -param_RS; +--:-:-:-:1 IADD neg_S, RZ, -param_S; + +[+ + our ($LN, $prop); + my ($m, $p, $q) = $LN ? qw(idx_M idx_P idx_Q) : qw(m p q); + return $prop eq 'f' ? qq{ +// mt = m * str_d - pad_d +// pr = p * str_h - pad_h +// qs = q * str_w - pad_w +--:-:-:-:1 XMAD mt, $m, param_str_d, RZ; +--:-:-:-:1 XMAD pr, $p, param_str_h, RZ; +--:-:-:-:1 XMAD qs, $q, param_str_w, RZ; +--:-:-:-:1 IADD mt, mt, -param_pad_d; +--:-:-:-:1 IADD pr, pr, -param_pad_h; +--:-:-:-:1 IADD qs, qs, -param_pad_w; + } : qq{ +// mt = m - pad_d +// pr = p - pad_h +// qs = q - pad_w +--:-:-:-:1 IADD mt, $m, -param_pad_d; +--:-:-:-:1 IADD pr, $p, -param_pad_h; +--:-:-:-:1 IADD qs, $q, -param_pad_w; + +--:-:-:-:1 IADD neg_str_d, RZ, -param_str_d; +--:-:-:-:1 IADD neg_str_h, RZ, -param_str_h; +--:-:-:-:1 IADD neg_str_w, RZ, -param_str_w; + }; ++] + + +[+ + our $LN; return $LN ? 
q{ +--:-:-:-:5 @P6 BRA.U END_SETUP; + } : ''; ++] + +LUT_LOOP: + + +// warp synchronous loop while warp_count < RST +--:-:-:-:1 ISETP.LT.AND P6, PT, warp_count, param_TRS, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, trs, param_TRS, PT; + +--:-:-:-:1 IADD warp_count, warp_count, warp_inc; +// t = trs / RS +// rs = trs % RS +--:-:-:-:1 XMAD.U16.U16 t, trs, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD.U16.S16 rs, t, neg_RS, trs; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.U16.U16 r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD.U16.S16 s, r, neg_S, rs; + +[+ + our ($SN, $N2, $N1, $prop); + if ($prop eq 'f') + { + return $N1 ? q{ +// x = qs + (s * dil_w) +// y = pr + (r * dil_h) +// z = mt + (t * dil_d) +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 IADD x1, x, param_str_w; +--:-:-:-:1 IADD x2, x1, param_str_w; +--:-:-:-:1 IADD x3, x2, param_str_w; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y, RZ, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P5; +--:-:-:-:1 @P4 R2P PR, predsI, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, P3; + +// sliceI = z*HWN + y*WN + x +01:-:-:-:1 XMAD.LO2C slice0I0, z, param_HWN, x; +--:-:-:-:1 XMAD.LO2C slice0I0, y, param_WN, slice0I0; +--:-:-:-:1 IADD slice0I1, slice0I0, param_str_w; +--:-:-:-:1 IADD slice0I2, slice0I1, param_str_w; +--:-:-:-:1 IADD slice0I3, slice0I2, param_str_w; + +--:-:-:-:1 @!P0 
MOV slice0I0, -1; +--:-:-:-:1 @!P1 MOV slice0I1, -1; +--:-:-:-:1 @!P2 MOV slice0I2, -1; +--:-:-:-:1 @!P3 MOV slice0I3, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 4; +--:-:-:-:1 IADD trs, trs, warp_inc; + +--:1:-:-:1 @P5 STS.128 [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : $N2 ? q{ + +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 IADD x1, x, param_str_w; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y, RZ, P5; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P3; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; + +// sliceI = z*HWN + y*WN + x*2 +01:-:-:-:1 XMAD.LO2C slice0I0, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I0, y, param_WN, slice0I0; +--:-:-:-:1 ISCADD slice0I1, x1, slice0I0, 1; +--:-:-:-:1 ISCADD slice0I0, x, slice0I0, 1; + +--:-:-:-:1 @!P0 MOV slice0I0, -1; +--:-:-:-:1 @!P1 MOV slice0I1, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 3; +--:-:-:-:1 IADD trs, trs, warp_inc; + +--:1:-:-:1 @P5 STS.64 [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : $SN ? 
q{ + +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P2, PT, x, RZ, P5; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x, param_W, P2; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD.LO2C slice0I, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I, y, param_WN, slice0I; +--:-:-:-:1 XMAD slice0I, x, param_N, slice0I; + + +--:-:-:-:1 @!P0 MOV slice0I, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 2; +--:-:-:-:1 IADD trs, trs, warp_inc; + + +--:1:-:-:1 @P5 STS [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : q{ + +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD x, s, param_dil_w, qs; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P2, PT, x, RZ, P5; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x, param_W, P2; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD sliceI, x, param_N, sliceI; +// sliceF = trs * K +--:-:-:-:1 XMAD sliceF, trs, param_K, RZ; + + + +// Get a mask of all valid slices in the warp +--:-:-:-:1 VOTE.ANY ballot, PT, P0; +// Count the total valid slices +--:-:2:-:1 POPC warp_slices, ballot; +// Prepare lutStore for this and next loop +--:-:-:-:1 @P0 MOV lutStore, lutStore2; +02:-:-:-:1 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +--:-:-:-:1 @P0 LOP.AND dep_thd_bits, dep_thd_mask, ballot; +--:-:3:-:1 @P0 POPC 
dep_thd_cnt, dep_thd_bits; +// use the trs increment to space the barrier sync +--:-:-:-:1 IADD trs, trs, warp_inc; +// Update the lutStore address from this count +04:-:-:-:1 @P0 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +--:1:-:-:1 @P0 STS.64 [lutStore + addr_lut], sliceIF; + +// Keep track of the total size of the lut +--:-:-:-:1 IADD lut_size, lut_size, warp_slices; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +--:1:-:-:2 STS [addr_szLut], lut_size; + }; + } + else # bprop + { + return $N1 ? q{ + +// x_prime = qs + s +// y_prime = pr + r +// z_prime = mt + t +--:-:-:-:1 XMAD z_prime, t, param_dil_d, mt; +--:-:-:-:1 XMAD y_prime, r, param_dil_h, pr; +--:-:-:-:1 XMAD x_prime, s, param_dil_w, qs; +--:-:-:-:1 IADD3 x_prime1, qs, 1, s; +--:-:-:-:1 IADD3 x_prime2, qs, 2, s; +--:-:-:-:1 IADD3 x_prime3, qs, 3, s; + +// z = z_prime / str_d +// z_mod = z_prime % str_d +--:-:-:-:1 XMAD z, z_prime, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z, z, param_shift_str_d; +--:-:-:-:1 XMAD.U16.S16 z_mod, z, neg_str_d, z_prime; +// y = y_prime / str_h +// y_mod = y_prime % str_h +--:-:-:-:1 XMAD y, y_prime, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y, y, param_shift_str_h; +--:-:-:-:1 XMAD.U16.S16 y_mod, y, neg_str_h, y_prime; +// x = x_prime / str_w +// x_mod = x_prime % str_w +--:-:-:-:1 XMAD x, x_prime, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x, x, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod, x, neg_str_w, x_prime; + +--:-:-:-:1 XMAD x1, x_prime1, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x1, x1, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod1, x1, neg_str_w, x_prime1; + +--:-:-:-:1 XMAD x2, x_prime2, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x2, x2, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod2, x2, neg_str_w, x_prime2; + +--:-:-:-:1 XMAD x3, x_prime3, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x3, x3, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod3, x3, neg_str_w, x_prime3; + + 
+--:-:-:-:1 ISETP.GE.AND P0, PT, z_prime, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y_prime, RZ, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.EQ.AND P0, PT, z_mod, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, y_mod, RZ, P1; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P5; +--:-:-:-:1 @P4 R2P PR, predsI, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 ISETP.GE.AND P0, PT, x_prime, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x_prime1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x_prime2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x_prime3, RZ, P3; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, P3; +--:-:-:-:1 ISETP.EQ.AND P0, PT, x_mod, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, x_mod1, RZ, P1; +--:-:-:-:1 ISETP.EQ.AND P2, PT, x_mod2, RZ, P2; +--:-:-:-:1 ISETP.EQ.AND P3, PT, x_mod3, RZ, P3; + +// sliceI = z*HWN + y*WN + x +01:-:-:-:1 XMAD.LO2C slice0I0, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I0, y, param_WN, slice0I0; +--:-:-:-:1 IADD slice0I1, slice0I0, x1; +--:-:-:-:1 IADD slice0I2, slice0I0, x2; +--:-:-:-:1 IADD slice0I3, slice0I0, x3; +--:-:-:-:1 IADD slice0I0, slice0I0, x; + +--:-:-:-:1 @!P0 MOV slice0I0, -1; +--:-:-:-:1 @!P1 MOV slice0I1, -1; +--:-:-:-:1 @!P2 MOV slice0I2, -1; +--:-:-:-:1 @!P3 MOV slice0I3, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 4; +--:-:-:-:1 IADD trs, trs, warp_inc; + +--:1:-:-:1 @P5 STS.128 [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : $N2 ? 
q{ + +// x_prime = qs + s +// y_prime = pr + r +// z_prime = mt + t +--:-:-:-:1 XMAD z_prime, t, param_dil_d, mt; +--:-:-:-:1 XMAD y_prime, r, param_dil_h, pr; +--:-:-:-:1 XMAD x_prime, s, param_dil_w, qs; +--:-:-:-:1 IADD3 x_prime1, qs, 1, s; +--:-:-:-:1 IADD3 x_prime2, qs, 2, s; +--:-:-:-:1 IADD3 x_prime3, qs, 3, s; + +// z = z_prime / str_d +// z_mod = z_prime % str_d +--:-:-:-:1 XMAD z, z_prime, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z, z, param_shift_str_d; +--:-:-:-:1 XMAD.U16.S16 z_mod, z, neg_str_d, z_prime; +// y = y_prime / str_h +// y_mod = y_prime % str_h +--:-:-:-:1 XMAD y, y_prime, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y, y, param_shift_str_h; +--:-:-:-:1 XMAD.U16.S16 y_mod, y, neg_str_h, y_prime; +// x = x_prime / str_w +// x_mod = x_prime % str_w +--:-:-:-:1 XMAD x, x_prime, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x, x, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod, x, neg_str_w, x_prime; + +--:-:-:-:1 XMAD x1, x_prime1, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x1, x1, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_mod1, x1, neg_str_w, x_prime1; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z_prime, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y_prime, RZ, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.EQ.AND P0, PT, z_mod, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, y_mod, RZ, P1; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P3; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + +--:-:-:-:1 ISETP.GE.AND P0, PT, x_prime, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x_prime1, RZ, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_W, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, P1; +--:-:-:-:1 ISETP.EQ.AND P0, PT, x_mod, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, x_mod1, RZ, P1; + +// sliceI = z*HWN + y*WN + x*2 +01:-:-:-:1 XMAD.LO2C slice0I0, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I0, y, param_WN, slice0I0; +--:-:-:-:1 ISCADD slice0I1, x1, slice0I0, 1; +--:-:-:-:1 ISCADD slice0I0, x, 
slice0I0, 1; + +--:-:-:-:1 @!P0 MOV slice0I0, -1; +--:-:-:-:1 @!P1 MOV slice0I1, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 3; +--:-:-:-:1 IADD trs, trs, warp_inc; + +--:1:-:-:1 @P5 STS.64 [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + + } : $SN ? q{ +// x_prime = qs + s +// y_prime = pr + r +// z_prime = mt + t +--:-:-:-:1 XMAD z_prime, t, param_dil_d, mt; +--:-:-:-:1 XMAD y_prime, r, param_dil_h, pr; +--:-:-:-:1 XMAD x_prime, s, param_dil_w, qs; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z_prime, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y_prime, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P2, PT, x_prime, RZ, P5; + +// z = z_prime / str_d +// z_prime = z_prime % str_d +--:-:-:-:1 XMAD z, z_prime, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z, z, param_shift_str_d; +--:-:-:-:1 XMAD.U16.S16 z_prime, z, neg_str_d, z_prime; +// y = y_prime / str_h +// y_prime = y_prime % str_h +--:-:-:-:1 XMAD y, y_prime, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y, y, param_shift_str_h; +--:-:-:-:1 XMAD.U16.S16 y_prime, y, neg_str_h, y_prime; +// x = x_prime / str_w +// x_prime = x_prime % str_w +--:-:-:-:1 XMAD x, x_prime, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x, x, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_prime, x, neg_str_w, x_prime; + +--:-:-:-:1 ISETP.EQ.AND P0, PT, z_prime, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, y_prime, RZ, P1; +--:-:-:-:1 ISETP.EQ.AND P2, PT, x_prime, RZ, P2; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x, param_W, P2; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD.LO2C slice0I, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C slice0I, y, param_WN, slice0I; +--:-:-:-:1 XMAD slice0I, x, param_N, slice0I; + + +--:-:-:-:1 @!P0 MOV slice0I, -1; +--:-:-:-:1 ISCADD lutStore, trs, sb_offset, 2; +--:-:-:-:1 IADD trs, trs, warp_inc; + + +--:1:-:-:1 @P5 STS [lutStore + addr_lut4], slice0I; + + +--:-:-:-:5 @P6 BRA.U 
LUT_LOOP; + + } : q{ +// x_prime = qs + s +// y_prime = pr + r +// z_prime = mt + t +--:-:-:-:1 XMAD z_prime, t, param_dil_d, mt; +--:-:-:-:1 XMAD y_prime, r, param_dil_h, pr; +--:-:-:-:1 XMAD x_prime, s, param_dil_w, qs; + +--:-:-:-:1 ISETP.GE.AND P0, PT, z_prime, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, y_prime, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P2, PT, x_prime, RZ, P5; + +// z = z_prime / str_d +// z_prime = z_prime % str_d +--:-:-:-:1 XMAD z, z_prime, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z, z, param_shift_str_d; +--:-:-:-:1 XMAD.U16.S16 z_prime, z, neg_str_d, z_prime; +// y = y_prime / str_h +// y_prime = y_prime % str_h +--:-:-:-:1 XMAD y, y_prime, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y, y, param_shift_str_h; +--:-:-:-:1 XMAD.U16.S16 y_prime, y, neg_str_h, y_prime; +// x = x_prime / str_w +// x_prime = x_prime % str_w +--:-:-:-:1 XMAD x, x_prime, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x, x, param_shift_str_w; +--:-:-:-:1 XMAD.U16.S16 x_prime, x, neg_str_w, x_prime; + +--:-:-:-:1 ISETP.EQ.AND P0, PT, z_prime, RZ, P0; +--:-:-:-:1 ISETP.EQ.AND P1, PT, y_prime, RZ, P1; +--:-:-:-:1 ISETP.EQ.AND P2, PT, x_prime, RZ, P2; +--:-:-:-:1 ISETP.LT.AND P0, PT, z, param_D, P0; +--:-:-:-:1 ISETP.LT.AND P1, PT, y, param_H, P1; +--:-:-:-:1 ISETP.LT.AND P2, PT, x, param_W, P2; +--:-:-:-:1 PSETP.AND.AND P0, PT, P0, P1, P2; + + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD sliceI, x, param_N, sliceI; +// sliceF = trs * K +--:-:-:-:1 XMAD sliceF, trs, param_K, RZ; + + + +// Get a mask of all valid slices in the warp +--:-:-:-:1 VOTE.ANY ballot, PT, P0; +// Count the total valid slices +--:-:2:-:1 POPC warp_slices, ballot; +// Prepare lutStore for this and next loop +--:-:-:-:1 @P0 MOV lutStore, lutStore2; +02:-:-:-:1 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +--:-:-:-:1 @P0 LOP.AND dep_thd_bits, dep_thd_mask, 
ballot; +--:-:3:-:1 @P0 POPC dep_thd_cnt, dep_thd_bits; +// use the trs increment to space the barrier sync +--:-:-:-:1 IADD trs, trs, warp_inc; +// Update the lutStore address from this count +04:-:-:-:1 @P0 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +--:1:-:-:1 @P0 STS.64 [lutStore + addr_lut], sliceIF; + +// Keep track of the total size of the lut +--:-:-:-:1 IADD lut_size, lut_size, warp_slices; + + +--:-:-:-:5 @P6 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +--:1:-:-:2 STS [addr_szLut], lut_size; + }; + } ++] + +END_SETUP: + +01:-:-:-:5 BAR.SYNC 0; + +// Grab the calculated lut size and get its reciprocal +// Get the total reduction depth +[+ + our $LN; return $LN ? q{ +--:-:1:-:2 LDS lutSize, [addr_szLut]; + } : q{ +--:-:-:-:6 MOV lutSize, param_TRS; + }; ++] +01:-:-:-:0 XMAD endCTRS, lutSize, param_C, RZ; +--:-:1:-:2 I2F.F32.S32 lutSizeRcp, lutSize; +--:-:-:-:0 IADD lutSizeM1, lutSize, -1; +01:-:1:-:1 MUFU.RCP lutSizeRcp, lutSizeRcp; + + +--:-:-:-:1 IADD endCTRS32, endCTRS, 32; +// posCTRS = tidY +//--:-:-:-:1 MOV posCTRS, tidY; +// If this value is not a multiple of 32 we want to grab the partial amount on the first fetch. +// If it is a multiple of 32 then make a full 32 line fetch. 
+--:-:-:-:1 LOP.AND.Z P5, partial, endCTRS, 31; +--:-:-:-:1 @P5 MOV partial, 32; +// channel = posCTRS / lutSize +// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it +--:-:2:-:1 I2F.F32.S32 posCTRSf, tidY; +03:-:-:-:1 FMUL channel, posCTRSf, lutSizeRcp; +--:-:-:-:1 FFMA channel, channel, 5.9604644775390625e-08, channel; +--:-:2:-:1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCTRS % lutSize) * 8 +02:-:-:-:1 VMAD.U16.U16 lutOffset0, -channel, lutSize, tidY; + +--:-:-:-:1 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT; + +// posCTRS = tidY + partial +--:-:-:-:1 IADD posCTRS, tidY, partial; +--:-:-:-:1 IADD tidY1, tidY, 1; +[+ + our ($N1, $N2, $LN, $dshift, $slice_scale, $slice_offset, $slice_load); + return $LN ? q{ +// P5 = tidY < partial && lutSize != 0 +--:-:-:-:1 LOP.AND.NZ P6, RZ, lutSize, -1; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, partial, P6; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY1, partial, P6; + +--:-:-:-:1 SHL lutOffset0, lutOffset0, 3; + +// offsetFC = channel * KRST +// offsetIC = channel * DHWN +--:-:-:-:1 XMAD.LO2C offsetIc0, channel, param_DHWN, RZ; +--:-:-:-:1 XMAD offsetFc0, channel, param_TRSK, RZ; + +--:-:-:-:1 @P0 IADD lutOffset1, lutOffset0, 8; +--:-:-:-:1 @P0 MOV offsetFc1, offsetFc0; +--:-:-:-:1 @P0 MOV offsetIc1, offsetIc0; +--:-:-:-:1 @!P0 MOV lutOffset1, RZ; +--:-:-:-:1 @!P0 IADD offsetFc1, offsetFc0, param_TRSK; +--:-:-:-:1 @!P0 IADD offsetIc1, offsetIc0, param_DHWN; + +--:-:5:-:1 @P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut]; +--:-:6:-:1 @P6 LDS.U.64 slice1IF, [lutOffset1 + addr_lut]; + } : qq{ +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, partial, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY1, partial, PT; + +--:-:-:-:1 XMAD offsetFc0, tidY, param_K, RZ; +--:-:-:-:1 XMAD offsetFc1, tidY1, param_K, RZ; + +--:-:-:-:1 XMAD partial, partial, param_K, RZ; +--:-:-:-:1 SHL partial, partial, $dshift; + +--:-:-:-:1 ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale; +--:-:-:-:1 XMAD.LO2C offsetIc0, 
channel, param_DHWN, RZ; + +--:-:-:-:1 \@P0 IADD lutOffset1, lutOffset0, $slice_offset; +--:-:-:-:1 \@P0 MOV offsetIc1, offsetIc0; +--:-:-:-:1 \@!P0 MOV lutOffset1, sb_offset; +--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN; + +--:-:5:-:1 \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4]; +--:-:6:-:1 \@P6 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4]; + }; ++] + + + +[+ + our ($LN, $dshift); + return $LN ? qq{ +10:-:-:-:1 IADD3 offsetFc0, offsetFc0, sliceF0, k; +--:-:-:-:1 LEA track0F0.CC, offsetFc0, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track0F1, offsetFc0, param_F[1], RZ, $dshift; + +20:-:-:-:1 IADD3 offsetFc1, offsetFc1, sliceF1, k; +--:-:-:-:1 LEA track1F0.CC, offsetFc1, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1F1, offsetFc1, param_F[1], RZ, $dshift; + } : qq{ +--:-:-:-:1 IADD offsetFc0, offsetFc0, k; +--:-:-:-:1 IADD offsetFc1, offsetFc1, k; +--:-:-:-:1 LEA track0F0.CC, offsetFc0, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track0F1, offsetFc0, param_F[1], RZ, $dshift; +--:-:-:-:1 LEA track1F0.CC, offsetFc1, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1F1, offsetFc1, param_F[1], RZ, $dshift; + }; ++] +[+ + our ($K1, $dtype, $vsize, $dsize); + return $K1 ? 
qq{ +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; +--:-:-:-:1 \@!P0 MOV F00, RZ; +--:-:-:-:1 \@!P1 MOV F01, RZ; +--:-:-:-:1 \@!P2 MOV F02, RZ; +--:-:-:-:1 \@!P3 MOV F03, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>]; +--:-:1:-:1 \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>]; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.L.U64 preds, preds, 4, preds; +--:-:-:-:1 \@!P0 MOV F10, RZ; +--:-:-:-:1 \@!P1 MOV F11, RZ; +--:-:-:-:1 \@!P2 MOV F12, RZ; +--:-:-:-:1 \@!P3 MOV F13, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>]; + +--:-:-:-:1 \@P6 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P6 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; +--:-:-:-:1 \@!P0 MOV F20, RZ; +--:-:-:-:1 \@!P1 MOV F21, RZ; +--:-:-:-:1 \@!P2 MOV F22, RZ; +--:-:-:-:1 \@!P3 MOV F23, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>]; +--:-:3:-:1 \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>]; + +--:-:-:-:1 \@P6 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P6 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.L.U64 preds, preds, 4, preds; +--:-:-:-:1 \@!P0 MOV F30, RZ; +--:-:-:-:1 \@!P1 MOV F31, RZ; +--:-:-:-:1 \@!P2 MOV F32, RZ; +--:-:-:-:1 \@!P3 MOV F33, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>]; +--:-:4:-:1 \@P3 LDG.E.CI.$dtype F33, [track1F 
+ ${dsize}x<35>]; + } : qq{ + +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k, param_Km32, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k, param_K, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, k, param_Km32, P6; + + +--:-:1:-:1 \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>]; +--:-:2:-:1 \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>]; +--:-:3:-:1 \@P2 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>]; +--:-:4:-:1 \@P3 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>]; + +--:-:-:-:1 \@!P0 LDS.U.$vsize F0, [addr_zero]; +--:-:-:-:1 \@!P1 LDS.U.$vsize F1, [addr_zero]; +--:-:-:-:1 \@!P2 LDS.U.$vsize F2, [addr_zero]; +--:-:1:-:1 \@!P3 LDS.U.$vsize F3, [addr_zero]; + + }; ++] + + +[+ + our ($N1, $N2, $SN, $dshift, $vsizeI); + return $N1 ? qq{ +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P2, PT, slice0I2, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice0I3, RZ, P5; +--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0; +--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0; +--:-:-:-:1 IADD slice0I2, slice0I2, offsetIc0; +--:-:-:-:1 IADD slice0I3, slice0I3, offsetIc0; +--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I4.CC, slice0I2, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I5, slice0I2, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I6.CC, slice0I3, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I7, slice0I3, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0]; +--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I01, [track0I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I02, [track0I4]; +--:-:5:-:1 \@P3 LDG.E.CI.$vsizeI I03, [track0I6]; + +--:-:-:-:1 \@!P0 MOV I00, RZ; +--:-:-:-:1 \@!P1 MOV I01, RZ; +--:-:-:-:1 \@!P2 MOV I02, RZ; +--:-:-:-:1 \@!P3 MOV I03, RZ; + 
+20:-:-:-:1 ISETP.GE.AND P0, PT, slice1I0, RZ, P6; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice1I1, RZ, P6; +--:-:-:-:1 ISETP.GE.AND P2, PT, slice1I2, RZ, P6; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I3, RZ, P6; +--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1; +--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1; +--:-:-:-:1 IADD slice1I2, slice1I2, offsetIc1; +--:-:-:-:1 IADD slice1I3, slice1I3, offsetIc1; +--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I4.CC, slice1I2, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I5, slice1I2, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I6.CC, slice1I3, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I7, slice1I3, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I10, [track1I0]; +--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I11, [track1I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I12, [track1I4]; +--:-:6:-:1 \@P3 LDG.E.CI.$vsizeI I13, [track1I6]; + +--:-:-:-:1 \@!P0 MOV I10, RZ; +--:-:-:-:1 \@!P1 MOV I11, RZ; +--:-:-:-:1 \@!P2 MOV I12, RZ; +--:-:-:-:1 \@!P3 MOV I13, RZ; + + } : $N2 ? 
qq{ + +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5; +20:-:-:-:1 ISETP.GE.AND P2, PT, slice1I0, RZ, P6; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I1, RZ, P6; +--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0; +--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0; +--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1; +--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1; +--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0]; +--:-:5:-:1 \@P1 LDG.E.CI.$vsizeI I02, [track0I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I10, [track1I0]; +--:-:6:-:1 \@P3 LDG.E.CI.$vsizeI I12, [track1I2]; +--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero]; +--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero]; +--:-:-:-:1 \@!P2 LDS.U.$vsizeI I10, [addr_zero]; +--:-:5:-:1 \@!P3 LDS.U.$vsizeI I12, [addr_zero]; + + + } : $SN ? 
qq{ + +10:-:-:-:1 ISETP.GE.AND P5, PT, slice0I, RZ, P5; +20:-:-:-:1 ISETP.GE.AND P6, PT, slice1I, RZ, P6; +--:-:-:-:1 IADD3 slice0I, slice0I, offsetIc0, n; +--:-:-:-:1 IADD3 slice1I, slice1I, offsetIc1, n; +--:-:-:-:1 LEA track0I0.CC, slice0I, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, slice1I, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I, param_I[1], RZ, $dshift; + +--:-:5:-:1 \@P5 LDG.E.CI.$vsizeI I0, [track0I]; +--:-:6:-:1 \@P6 LDG.E.CI.$vsizeI I1, [track1I]; +--:-:-:-:1 \@!P5 LDS.U.$vsizeI I0, [addr_zero]; +--:-:5:-:1 \@!P6 LDS.U.$vsizeI I1, [addr_zero]; + + + } : qq{ +--:-:-:-:1 IADD3 offsetIc0, offsetIc0, sliceI0, n; +--:-:-:-:1 IADD3 offsetIc1, offsetIc1, sliceI1, n; +--:-:-:-:1 LEA track0I0.CC, offsetIc0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, offsetIc0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, offsetIc1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, offsetIc1, param_I[1], RZ, $dshift; + +--:-:5:-:1 \@P5 LDG.E.CI.$vsizeI I0, [track0I]; +--:-:6:-:1 \@P6 LDG.E.CI.$vsizeI I1, [track1I]; +--:-:-:-:1 \@!P5 LDS.U.$vsizeI I0, [addr_zero]; +--:-:5:-:1 \@!P6 LDS.U.$vsizeI I1, [addr_zero]; + + }; ++] + + +[+ + our ($convert_in, $K1); + return !$convert_in ? '' : $K1 ? 
qq{ +01:-:-:-:1 $convert_in F00, F00; +--:-:-:-:1 $convert_in F01, F01; +--:-:-:-:1 $convert_in F02, F02; +--:-:1:-:1 $convert_in F03, F03; + +02:-:-:-:1 $convert_in F10, F10; +--:-:-:-:1 $convert_in F11, F11; +--:-:-:-:1 $convert_in F12, F12; +--:-:2:-:1 $convert_in F13, F13; + +04:-:-:-:1 $convert_in F20, F20; +--:-:-:-:1 $convert_in F21, F21; +--:-:-:-:1 $convert_in F22, F22; +--:-:3:-:1 $convert_in F23, F23; + +08:-:-:-:1 $convert_in F30, F30; +--:-:-:-:1 $convert_in F31, F31; +--:-:-:-:1 $convert_in F32, F32; +--:-:4:-:1 $convert_in F33, F33; + } : qq{ +01:-:-:-:1 $convert_in F03, F01.H1; +--:-:-:-:1 $convert_in F02, F01.H0; +--:-:-:-:1 $convert_in F01, F00.H1; +--:-:1:-:1 $convert_in F00, F00.H0; + +02:-:-:-:1 $convert_in F13, F11.H1; +--:-:-:-:1 $convert_in F12, F11.H0; +--:-:-:-:1 $convert_in F11, F10.H1; +--:-:2:-:1 $convert_in F10, F10.H0; + +04:-:-:-:1 $convert_in F23, F21.H1; +--:-:-:-:1 $convert_in F22, F21.H0; +--:-:-:-:1 $convert_in F21, F20.H1; +--:-:3:-:1 $convert_in F20, F20.H0; + +08:-:-:-:1 $convert_in F33, F31.H1; +--:-:-:-:1 $convert_in F32, F31.H0; +--:-:-:-:1 $convert_in F31, F30.H1; +--:-:4:-:1 $convert_in F30, F30.H0; + }; ++] +[+ + our ($convert_in, $N1, $N2); + return !$convert_in ? '' : $N1 ? qq{ +10:-:-:-:1 $convert_in I03, I03; +--:-:-:-:1 $convert_in I02, I02; +--:-:-:-:1 $convert_in I01, I01; +--:-:5:-:1 $convert_in I00, I00; + +20:-:-:-:1 $convert_in I13, I13; +--:-:-:-:1 $convert_in I12, I12; +--:-:-:-:1 $convert_in I11, I11; +--:-:6:-:1 $convert_in I10, I10; + } : $N2 ? 
qq{ +10:-:-:-:1 $convert_in I03, I02.H1; +--:-:-:-:1 $convert_in I02, I02.H0; +--:-:-:-:1 $convert_in I01, I00.H1; +--:-:5:-:1 $convert_in I00, I00.H0; + +20:-:-:-:1 $convert_in I13, I12.H1; +--:-:-:-:1 $convert_in I12, I12.H0; +--:-:-:-:1 $convert_in I11, I10.H1; +--:-:6:-:1 $convert_in I10, I10.H0; + } : qq{ +10:-:-:-:1 $convert_in I03, I01.H1; +--:-:-:-:1 $convert_in I02, I01.H0; +--:-:-:-:1 $convert_in I01, I00.H1; +--:-:5:-:1 $convert_in I00, I00.H0; + +20:-:-:-:1 $convert_in I13, I11.H1; +--:-:-:-:1 $convert_in I12, I11.H0; +--:-:-:-:1 $convert_in I11, I10.H1; +--:-:6:-:1 $convert_in I10, I10.H0; + }; ++] + +01:-:-:-:1 STS.128 [writeFs + 4x<0*32>], F0; +02:-:-:-:1 STS.128 [writeFs + 4x<1*32>], F1; +04:-:-:-:1 STS.128 [writeFs + 4x<2*32>], F2; +08:-:-:-:1 STS.128 [writeFs + 4x<3*32>], F3; + +10:-:-:-:1 STS.128 [writeIs + 4x<0*32>], I0; +20:-:-:-:1 STS.128 [writeIs + 4x<1*32>], I1; + +--:-:-:-:0 ISETP.LT.AND P5, PT, posCTRS, endCTRS, PT; +--:-:5:-:1 I2F.F32.S32 posCTRSf, posCTRS; + +--:-:-:-:5 BAR.SYNC 0; + + +--:-:-:-:1 IADD writeFs, writeFs, swapBuf; +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + + +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ix0, [readIs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*64 + 32>]; +--:-:1:-:1 LDS.U.128 j0Ix4, [readIs + 4x<0*32 + 16>]; + + +10:-:-:-:1 FMUL channel, posCTRSf, lutSizeRcp; +--:-:-:-:1 FFMA channel, channel, 5.9604644775390625e-08, channel; +--:-:5:-:1 F2I.S32.F32.TRUNC channel, channel; + +10:-:-:-:1 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS; +--:-:-:-:1 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT; +[+ + our ($N1, $N2, $LN, $dshift, $slice_scale, $slice_offset, $slice_load); + return $LN ? 
q{ + +--:-:-:-:1 SHL lutOffset0, lutOffset0, 3; +--:-:-:-:1 XMAD.LO2C offsetIc0, channel, param_DHWN, RZ; +--:-:-:-:1 XMAD offsetFc0, channel, param_TRSK, RZ; + +--:-:-:-:1 @P0 IADD lutOffset1, lutOffset0, 8; +--:-:-:-:1 @P0 MOV offsetFc1, offsetFc0; +--:-:-:-:1 @P0 MOV offsetIc1, offsetIc0; +--:-:-:-:1 @!P0 MOV lutOffset1, RZ; +--:-:-:-:1 @!P0 IADD offsetFc1, offsetFc0, param_TRSK; +--:-:-:-:1 @!P0 IADD offsetIc1, offsetIc0, param_DHWN; + +--:-:-:-:1 IADD posCTRS, posCTRS, 32; +--:-:5:-:1 @P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut]; +--:-:6:-:1 @P5 LDS.U.64 slice1IF, [lutOffset1 + addr_lut]; + + } : qq{ + +--:-:-:-:1 ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale; +--:-:-:-:1 XMAD.LO2C offsetIc0, channel, param_DHWN, RZ; + +--:-:-:-:1 \@P0 IADD lutOffset1, lutOffset0, $slice_offset; +--:-:-:-:1 \@P0 MOV offsetIc1, offsetIc0; +--:-:-:-:1 \@!P0 MOV lutOffset1, sb_offset; +--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN; + +--:-:-:-:1 IADD posCTRS, posCTRS, 32; +--:-:5:-:1 \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4]; +--:-:6:-:1 \@P5 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4]; + }; ++] + + + + +[+ + our ($LN, $dshift); + return $LN ? qq{ +10:-:-:-:1 IADD3 offsetFc0, offsetFc0, sliceF0, k; +--:-:-:-:1 LEA track0F0.CC, offsetFc0, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track0F1, offsetFc0, param_F[1], RZ, $dshift; + +20:-:-:-:1 IADD3 offsetFc1, offsetFc1, sliceF1, k; +--:-:-:-:1 LEA track1F0.CC, offsetFc1, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1F1, offsetFc1, param_F[1], RZ, $dshift; + } : qq{ +--:-:-:-:1 IADD track0F0.CC, track0F0, partial; +--:-:-:-:1 IADD.X track0F1, track0F1, RZ; +--:-:-:-:1 IADD track1F0.CC, track1F0, partial; +--:-:-:-:1 IADD.X track1F1, track1F1, RZ; + }; ++] + +[+ + our ($K1, $dtype, $vsize, $dsize); + return $K1 ? 
qq{ +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 \@P5 SHF.R.U64 preds, preds, 4, preds; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>]; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 \@P5 SHF.L.U64 preds, preds, 4, preds; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>]; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 \@P5 SHF.R.U64 preds, preds, 4, preds; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>]; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 \@P5 SHF.L.U64 preds, preds, 4, preds; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>]; +--:-:-:-:1 \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>]; +--:-:2:-:1 \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>]; + } : qq{ +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k, param_Km32, P5; + +--:-:2:-:1 \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>]; +--:-:2:-:1 \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>]; +--:-:2:-:1 \@P0 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>]; +--:-:2:-:1 \@P1 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>]; + }; ++] + + + + +[+ + our ($N1, $N2, $SN, $dshift, $vsizeI); + return $N1 ? 
qq{ + +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P2, PT, slice0I2, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice0I3, RZ, P5; + +--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0; +--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0; +--:-:-:-:1 IADD slice0I2, slice0I2, offsetIc0; +--:-:-:-:1 IADD slice0I3, slice0I3, offsetIc0; +--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I4.CC, slice0I2, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I5, slice0I2, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I6.CC, slice0I3, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I7, slice0I3, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0]; +--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I01, [track0I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I02, [track0I4]; +--:-:2:-:1 \@P3 LDG.E.CI.$vsizeI I03, [track0I6]; + +--:-:-:-:1 \@!P0 MOV I00, RZ; +--:-:-:-:1 \@!P1 MOV I01, RZ; +--:-:-:-:1 \@!P2 MOV I02, RZ; +--:-:-:-:1 \@!P3 MOV I03, RZ; + + +20:-:-:-:1 ISETP.GE.AND P0, PT, slice1I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice1I1, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P2, PT, slice1I2, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I3, RZ, P5; + +--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1; +--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1; +--:-:-:-:1 IADD slice1I2, slice1I2, offsetIc1; +--:-:-:-:1 IADD slice1I3, slice1I3, offsetIc1; +--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I4.CC, slice1I2, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I5, slice1I2, 
param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I6.CC, slice1I3, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I7, slice1I3, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I10, [track1I0]; +--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I11, [track1I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I12, [track1I4]; +--:3:2:-:1 \@P3 LDG.E.CI.$vsizeI I13, [track1I6]; + +--:-:-:-:1 \@!P0 MOV I10, RZ; +--:-:-:-:1 \@!P1 MOV I11, RZ; +--:-:-:-:1 \@!P2 MOV I12, RZ; +--:-:-:-:1 \@!P3 MOV I13, RZ; + + } : $N2 ? qq{ + +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5; +20:-:-:-:1 ISETP.GE.AND P2, PT, slice1I0, RZ, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I1, RZ, P5; + +--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0; +--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0; +--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1; +--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1; +--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0]; +--:-:2:-:1 \@P1 LDG.E.CI.$vsizeI I02, [track0I2]; +--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I10, [track1I0]; +--:3:2:-:1 \@P3 LDG.E.CI.$vsizeI I12, [track1I2]; +--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero]; +--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero]; +--:-:-:-:1 \@!P2 LDS.U.$vsizeI I10, [addr_zero]; +--:-:-:-:1 \@!P3 LDS.U.$vsizeI I12, [addr_zero]; + + + } : $SN ? 
qq{ + +10:-:-:-:1 ISETP.GE.AND P0, PT, slice0I, RZ, P5; +20:-:-:-:1 ISETP.GE.AND P1, PT, slice1I, RZ, P5; +--:-:-:-:1 IADD3 slice0I, slice0I, offsetIc0, n; +--:-:-:-:1 IADD3 slice1I, slice1I, offsetIc1, n; +--:-:-:-:1 LEA track0I0.CC, slice0I, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, slice0I, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, slice1I, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, slice1I, param_I[1], RZ, $dshift; + +--:-:2:-:1 \@P0 LDG.E.CI.$vsizeI I0, [track0I]; +--:3:2:-:1 \@P1 LDG.E.CI.$vsizeI I1, [track1I]; +--:-:-:-:1 \@!P0 LDS.U.$vsizeI I0, [addr_zero]; +--:-:-:-:1 \@!P1 LDS.U.$vsizeI I1, [addr_zero]; + + + } : qq{ +--:-:-:-:1 IADD3 offsetIc0, offsetIc0, sliceI0, n; +--:-:-:-:1 IADD3 offsetIc1, offsetIc1, sliceI1, n; +--:-:-:-:1 LEA track0I0.CC, offsetIc0, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track0I1, offsetIc0, param_I[1], RZ, $dshift; +--:-:-:-:1 LEA track1I0.CC, offsetIc1, param_I[0], $dshift; +--:-:-:-:1 LEA.HI.X track1I1, offsetIc1, param_I[1], RZ, $dshift; + +--:-:2:-:1 \@P5 LDG.E.CI.$vsizeI I0, [track0I]; +--:3:2:-:1 \@P5 LDG.E.CI.$vsizeI I1, [track1I]; + + }; ++] + + +LOOP: +[+ + our ($N1, $N2, $SN, $LN, $K1, $dtype, $dshift, $dsize, $vsize, $vsizeI, + $convert_in, $slice_scale, $slice_offset, $slice_load); + + my %insert = ( + j0c1 => "--:-:5:-:1 I2F.F32.S32 posCTRSf, posCTRS;\n", + j0c3 => "--:-:-:-:1 ISETP.LT.AND P5, PT, posCTRS, endCTRS, PT;\n", + j0c5 => "--:-:-:-:1 ISETP.LT.AND P6, PT, posCTRS, endCTRS32, PT;\n", + + j0c15 => "10:-:-:-:1 \@P5 FMUL channel, posCTRSf, lutSizeRcp;\n", + j0c20 => "--:-:-:-:1 \@P5 FFMA channel, channel, 5.9604644775390625e-08, channel;\n", + j0c22 => "--:-:5:-:1 \@P5 F2I.S32.F32.TRUNC channel, channel;\n", + + $LN ? ( + j0c36 => "10:-:-:-:1 \@P5 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;\n" . + "--:-:-:-:1 \@P5 XMAD offsetIc0, channel, param_DHWN, RZ;\n" . + "--:-:-:-:1 \@P5 XMAD offsetFc0, channel, param_TRSK, RZ;\n" . 
+ "--:-:-:-:1 IADD posCTRS, posCTRS, 32;\n", + + j0c38 => "--:-:-:-:1 \@P5 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;\n" . + "--:-:-:-:1 \@P5 XMAD.PSL offsetIc0, channel, param_DHWN.H1, offsetIc0;\n" . + "--:-:-:-:1 \@P5 SHL lutOffset0, lutOffset0, 3;\n", + + j0c42 => "--:-:5:-:1 \@P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut];\n", + + j0c49 => "--:-:-:-:1 \@P0 I2I.U32.U32 offsetFc1, offsetFc0;\n" . + "--:-:-:-:1 \@!P0 IADD offsetFc1, offsetFc0, param_TRSK;\n", + + j0c50 => "--:-:-:-:1 \@P0 I2I.U32.U32 offsetIc1, offsetIc0;\n" . + "--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;\n", + + j0c51 => "--:-:4:-:1 \@!P0 I2I.U32.U32 lutOffset1, RZ;\n" . + "--:-:-:-:1 \@P0 IADD lutOffset1, lutOffset0, 8;\n", + + j1c44 => "10:-:-:-:1 \@P5 IADD3 offsetFc0, offsetFc0, sliceF0, k;\n", + j1c49 => "04:-:-:-:1 \@P5 LEA track0F0.CC, offsetFc0, param_F[0], $dshift;\n", + j1c54 => "--:-:-:-:1 \@P5 LEA.HI.X track0F1, offsetFc0, param_F[1], RZ, $dshift;\n", + + j2c16 => "08:-:5:-:1 \@P5 LDS.U.64 slice1IF, [lutOffset1 + addr_lut];\n", + + j3c44 => "10:-:-:-:1 \@P5 IADD3 offsetFc1, offsetFc1, sliceF1, k;\n", + j3c49 => "--:-:-:-:1 \@P5 LEA track1F0.CC, offsetFc1, param_F[0], $dshift;\n", + j3c54 => "--:-:-:-:1 \@P5 LEA.HI.X track1F1, offsetFc1, param_F[1], RZ, $dshift;\n", + + j5c44 => "--:-:-:-:1 \@P5 IADD3 offsetIc0, offsetIc0, sliceI0, n;\n", + j5c49 => "--:-:-:-:1 \@P5 LEA track0I0.CC, offsetIc0, param_I[0], $dshift;\n", + j5c54 => "--:-:-:-:1 \@P5 LEA.HI.X track0I1, offsetIc0, param_I[1], RZ, $dshift;\n", + j5c60 => "20:-:2:-:1 \@P5 LDG.E.CI.$vsize I0, [track0I];\n", + + j6c44 => "--:-:-:-:1 \@P5 IADD3 offsetIc1, offsetIc1, sliceI1, n;\n", + j6c49 => "--:-:-:-:1 \@P5 LEA track1I0.CC, offsetIc1, param_I[0], $dshift;\n", + j6c54 => "--:-:-:-:1 \@P5 LEA.HI.X track1I1, offsetIc1, param_I[1], RZ, $dshift;\n", + j6c60 => "20:3:2:-:1 \@P5 LDG.E.CI.$vsize I1, [track1I];\n", + + ) : ( + j0c36 => "10:-:-:-:1 \@P5 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;\n" . 
+ "--:-:-:-:1 \@P5 XMAD offsetIc0, channel, param_DHWN, RZ;\n" . + "--:-:-:-:1 IADD posCTRS, posCTRS, 32;\n", + + j0c39 => "--:-:-:-:1 \@P5 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;\n" . + "--:-:-:-:1 \@P5 XMAD.PSL offsetIc0, channel, param_DHWN.H1, offsetIc0;\n" . + "--:-:-:-:1 \@P5 ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale;\n", + + j0c43 => "--:-:-:-:1 \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4];\n", + + j0c50 => "--:-:-:-:1 \@P0 I2I.U32.U32 offsetIc1, offsetIc0;\n" . + "--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;\n", + + j0c51 => "--:-:4:-:1 \@!P0 I2I.U32.U32 lutOffset1, sb_offset;\n" . + "--:-:-:-:1 \@P0 IADD lutOffset1, lutOffset0, $slice_offset;\n", + + j2c16 => "08:-:-:-:1 \@P5 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4];\n", + + j1c49 => "04:-:-:-:1 \@P5 IADD track0F0.CC, track0F0, param_K32p;\n", + j1c54 => "--:-:-:-:1 \@P5 IADD.X track0F1, track0F1, RZ;\n", + + j3c49 => "--:-:-:-:1 \@P5 IADD track1F0.CC, track1F0, param_K32p;\n", + j3c54 => "--:-:-:-:1 \@P5 IADD.X track1F1, track1F1, RZ;\n", + ), + + $N1 ? ( + + j5c31 => "--:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0;\n" . + "--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0;\n" . + "--:-:-:-:1 ISETP.GE.AND P2, PT, slice0I2, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I2, slice0I2, offsetIc0;\n" . + "--:-:-:-:1 ISETP.GE.AND P3, PT, slice0I3, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I3, slice0I3, offsetIc0;\n", + + j5c32 => "--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift;\n", + j5c37 => "--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift;\n", + j5c42 => "--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift;\n" . 
+ "--:-:-:-:1 LEA track0I4.CC, slice0I2, param_I[0], $dshift;\n", + j5c47 => "--:-:-:-:1 LEA.HI.X track0I5, slice0I2, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track0I6.CC, slice0I3, param_I[0], $dshift;\n", + j5c52 => "--:-:-:-:1 LEA.HI.X track0I7, slice0I3, param_I[1], RZ, $dshift;\n", + + j5c55 => "20:-:-:-:1 \@!P0 I2I.U32.U32 I00, RZ;\n", + j5c57 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I01, RZ;\n", + j5c59 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I02, RZ;\n", + j5c61 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I03, RZ;\n", + + j5c56 => "--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0];\n", + j5c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I01, [track0I2];\n", + j5c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I02, [track0I4];\n", + j5c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$vsizeI I03, [track0I6];\n", + + j6c31 => "--:-:-:-:1 ISETP.GE.AND P0, PT, slice1I0, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1;\n" . + "--:-:-:-:1 ISETP.GE.AND P1, PT, slice1I1, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1;\n" . + "--:-:-:-:1 ISETP.GE.AND P2, PT, slice1I2, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I2, slice1I2, offsetIc1;\n" . + "--:-:-:-:1 ISETP.GE.AND P3, PT, slice1I3, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I3, slice1I3, offsetIc1;\n", + + j6c32 => "--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift;\n", + j6c37 => "--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift;\n", + j6c42 => "--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track1I4.CC, slice1I2, param_I[0], $dshift;\n", + j6c47 => "--:-:-:-:1 LEA.HI.X track1I5, slice1I2, param_I[1], RZ, $dshift;\n" . 
+ "--:-:-:-:1 LEA track1I6.CC, slice1I3, param_I[0], $dshift;\n", + j6c52 => "--:-:-:-:1 LEA.HI.X track1I7, slice1I3, param_I[1], RZ, $dshift;\n", + + j6c55 => "20:-:-:-:1 \@!P0 I2I.U32.U32 I10, RZ;\n", + j6c57 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I11, RZ;\n", + j6c59 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I12, RZ;\n", + j6c61 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I13, RZ;\n", + + j6c56 => "--:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I10, [track1I0];\n", + j6c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$vsizeI I11, [track1I2];\n", + j6c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$vsizeI I12, [track1I4];\n", + j6c62 => "--:3:2:-:1 \@P3 LDG.E.CI.$vsizeI I13, [track1I6];\n", + + ) : $N2 ? ( + + j5c31 => "--:-:-:-:1 ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I0, slice0I0, offsetIc0;\n" . + "--:-:-:-:1 ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" . + "--:-:-:-:1 IADD slice0I1, slice0I1, offsetIc0;\n", + + j5c35 => "--:-:-:-:1 LEA track0I0.CC, slice0I0, param_I[0], $dshift;\n", + j5c40 => "--:-:-:-:1 LEA.HI.X track0I1, slice0I0, param_I[1], RZ, $dshift;\n" . + "--:-:-:-:1 LEA track0I2.CC, slice0I1, param_I[0], $dshift;\n", + j5c45 => "--:-:-:-:1 LEA.HI.X track0I3, slice0I1, param_I[1], RZ, $dshift;\n", + + j5c46 => "--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero];\n", + j5c47 => "--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero];\n", + + j5c60 => "20:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I00, [track0I0];\n", + j5c62 => "--:-:2:-:1 \@P1 LDG.E.CI.$vsizeI I02, [track0I2];\n", + + j6c31 => "--:-:-:-:1 ISETP.GE.AND P0, PT, slice1I0, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I0, slice1I0, offsetIc1;\n" . + "--:-:-:-:1 ISETP.GE.AND P1, PT, slice1I1, RZ, P5;\n" . + "--:-:-:-:1 IADD slice1I1, slice1I1, offsetIc1;\n", + + j6c35 => "--:-:-:-:1 LEA track1I0.CC, slice1I0, param_I[0], $dshift;\n", + j6c40 => "--:-:-:-:1 LEA.HI.X track1I1, slice1I0, param_I[1], RZ, $dshift;\n" . 
+ "--:-:-:-:1 LEA track1I2.CC, slice1I1, param_I[0], $dshift;\n", + j6c45 => "--:-:-:-:1 LEA.HI.X track1I3, slice1I1, param_I[1], RZ, $dshift;\n", + + j6c46 => "--:-:-:-:1 \@!P0 LDS.U.$vsizeI I10, [addr_zero];\n", + j6c47 => "--:-:-:-:1 \@!P1 LDS.U.$vsizeI I12, [addr_zero];\n", + + j6c60 => "20:-:-:-:1 \@P0 LDG.E.CI.$vsizeI I10, [track1I0];\n", + j6c62 => "--:3:2:-:1 \@P1 LDG.E.CI.$vsizeI I12, [track1I2];\n", + + ) : $SN ? ( + j5c31 => "--:-:-:-:1 ISETP.GE.AND P2, PT, slice0I, RZ, P5;\n", + j5c45 => "--:-:-:-:1 \@!P2 LDS.U.$vsize I0, [addr_zero];\n", + + j5c44 => "--:-:-:-:1 \@P5 IADD3 offsetIc0, offsetIc0, slice0I, n;\n", + j5c49 => "--:-:-:-:1 \@P5 LEA track0I0.CC, offsetIc0, param_I[0], $dshift;\n", + j5c54 => "--:-:-:-:1 \@P5 LEA.HI.X track0I1, offsetIc0, param_I[1], RZ, $dshift;\n", + j5c60 => "20:-:2:-:1 \@P2 LDG.E.CI.$vsize I0, [track0I];\n", + + j6c31 => "--:-:-:-:1 ISETP.GE.AND P2, PT, slice1I, RZ, P5;\n", + j6c45 => "--:-:-:-:1 \@!P2 LDS.U.$vsize I1, [addr_zero];\n", + + j6c44 => "--:-:-:-:1 \@P5 IADD3 offsetIc1, offsetIc1, slice1I, n;\n", + j6c49 => "--:-:-:-:1 \@P5 LEA track1I0.CC, offsetIc1, param_I[0], $dshift;\n", + j6c54 => "--:-:-:-:1 \@P5 LEA.HI.X track1I1, offsetIc1, param_I[1], RZ, $dshift;\n", + j6c60 => "20:3:2:-:1 \@P2 LDG.E.CI.$vsize I1, [track1I];\n", + ) : (), + + j1c30 => "20:6:-:-:1 \@P6 STS.128 [writeFs + 4x<0*32>], F0;\n", + j2c30 => "20:6:-:-:1 \@P6 STS.128 [writeFs + 4x<1*32>], F1;\n", + j3c30 => "20:6:-:-:1 \@P6 STS.128 [writeFs + 4x<2*32>], F2;\n", + j4c30 => "20:6:-:-:1 \@P6 STS.128 [writeFs + 4x<3*32>], F3;\n", + j5c30 => "20:6:-:-:1 \@P6 STS.128 [writeIs + 4x<0*32>], I0;\n", + j6c30 => "20:6:-:-:1 \@P6 STS.128 [writeIs + 4x<1*32>], I1;\n", + + $convert_in ? ( + j1c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j2c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j3c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j4c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j5c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j6c5 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + $K1 ? 
( + j1c8 => "--:-:-:-:1 \@P6 $convert_in F00, F00;\n", + j1c10 => "--:-:-:-:1 \@P6 $convert_in F01, F01;\n", + j1c12 => "--:-:-:-:1 \@P6 $convert_in F02, F02;\n", + j1c14 => "--:-:6:-:1 \@P6 $convert_in F03, F03;\n", + + j2c8 => "--:-:-:-:1 \@P6 $convert_in F10, F10;\n", + j2c10 => "--:-:-:-:1 \@P6 $convert_in F11, F11;\n", + j2c12 => "--:-:-:-:1 \@P6 $convert_in F12, F12;\n", + j2c14 => "--:-:6:-:1 \@P6 $convert_in F13, F13;\n", + + j3c8 => "--:-:-:-:1 \@P6 $convert_in F20, F20;\n", + j3c10 => "--:-:-:-:1 \@P6 $convert_in F21, F21;\n", + j3c12 => "--:-:-:-:1 \@P6 $convert_in F22, F22;\n", + j3c14 => "--:-:6:-:1 \@P6 $convert_in F23, F23;\n", + + j4c8 => "--:-:-:-:1 \@P6 $convert_in F30, F30;\n", + j4c10 => "--:-:-:-:1 \@P6 $convert_in F31, F31;\n", + j4c12 => "--:-:-:-:1 \@P6 $convert_in F32, F32;\n", + j4c14 => "--:-:6:-:1 \@P6 $convert_in F33, F33;\n", + ) : ( + j1c8 => "--:-:-:-:1 \@P6 $convert_in F03, F01.H1;\n", + j1c10 => "--:-:-:-:1 \@P6 $convert_in F02, F01.H0;\n", + j1c12 => "--:-:-:-:1 \@P6 $convert_in F01, F00.H1;\n", + j1c14 => "--:-:6:-:1 \@P6 $convert_in F00, F00.H0;\n", + + j2c8 => "--:-:-:-:1 \@P6 $convert_in F13, F11.H1;\n", + j2c10 => "--:-:-:-:1 \@P6 $convert_in F12, F11.H0;\n", + j2c12 => "--:-:-:-:1 \@P6 $convert_in F11, F10.H1;\n", + j2c14 => "--:-:6:-:1 \@P6 $convert_in F10, F10.H0;\n", + + j3c8 => "--:-:-:-:1 \@P6 $convert_in F23, F21.H1;\n", + j3c10 => "--:-:-:-:1 \@P6 $convert_in F22, F21.H0;\n", + j3c12 => "--:-:-:-:1 \@P6 $convert_in F21, F20.H1;\n", + j3c14 => "--:-:6:-:1 \@P6 $convert_in F20, F20.H0;\n", + + j4c8 => "--:-:-:-:1 \@P6 $convert_in F33, F31.H1;\n", + j4c10 => "--:-:-:-:1 \@P6 $convert_in F32, F31.H0;\n", + j4c12 => "--:-:-:-:1 \@P6 $convert_in F31, F30.H1;\n", + j4c14 => "--:-:6:-:1 \@P6 $convert_in F30, F30.H0;\n", + ), + $N1 ? 
( + j5c8 => "--:-:-:-:1 \@P6 $convert_in I03, I03;\n", + j5c10 => "--:-:-:-:1 \@P6 $convert_in I02, I02;\n", + j5c12 => "--:-:-:-:1 \@P6 $convert_in I01, I01;\n", + j5c14 => "--:-:6:-:1 \@P6 $convert_in I00, I00;\n", + + j6c8 => "--:-:-:-:1 \@P6 $convert_in I13, I13;\n", + j6c10 => "--:-:-:-:1 \@P6 $convert_in I12, I12;\n", + j6c12 => "--:-:-:-:1 \@P6 $convert_in I11, I11;\n", + j6c14 => "--:-:6:-:1 \@P6 $convert_in I10, I10;\n", + ) : $N2 ? ( + j5c8 => "--:-:-:-:1 \@P6 $convert_in I03, I02.H1;\n", + j5c10 => "--:-:-:-:1 \@P6 $convert_in I02, I02.H0;\n", + j5c12 => "--:-:-:-:1 \@P6 $convert_in I01, I00.H1;\n", + j5c14 => "--:-:6:-:1 \@P6 $convert_in I00, I00.H0;\n", + + j6c8 => "--:-:-:-:1 \@P6 $convert_in I13, I12.H1;\n", + j6c10 => "--:-:-:-:1 \@P6 $convert_in I12, I12.H0;\n", + j6c12 => "--:-:-:-:1 \@P6 $convert_in I11, I10.H1;\n", + j6c14 => "--:-:6:-:1 \@P6 $convert_in I10, I10.H0;\n", + ) : ( + j5c8 => "--:-:-:-:1 \@P6 $convert_in I03, I01.H1;\n", + j5c10 => "--:-:-:-:1 \@P6 $convert_in I02, I01.H0;\n", + j5c12 => "--:-:-:-:1 \@P6 $convert_in I01, I00.H1;\n", + j5c14 => "--:-:6:-:1 \@P6 $convert_in I00, I00.H0;\n", + + j6c8 => "--:-:-:-:1 \@P6 $convert_in I13, I11.H1;\n", + j6c10 => "--:-:-:-:1 \@P6 $convert_in I12, I11.H0;\n", + j6c12 => "--:-:-:-:1 \@P6 $convert_in I11, I10.H1;\n", + j6c14 => "--:-:6:-:1 \@P6 $convert_in I10, I10.H0;\n", + ), + ) : ( + j1c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j2c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j3c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j4c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j5c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + j6c27 => "--:-:-:-:1 DEPBAR.LE SB1, 5;\n", + ), + + $K1 ? 
( + j1c31 => "--:-:-:-:1 \@P5 R2P PR, preds, 0x0f;\n", + j1c32 => "--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f;\n", + j1c33 => "--:-:-:-:1 \@P5 SHF.R.U64 preds, preds, 4, preds;\n", + j1c56 => "20:-:-:-:1 \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>];\n", + j1c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>];\n", + j1c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>];\n", + j1c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>];\n", + + j2c31 => "--:-:-:-:1 \@P5 R2P PR, preds, 0x0f;\n", + j2c32 => "--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f;\n", + j2c33 => "--:-:-:-:1 \@P5 SHF.L.U64 preds, preds, 4, preds;\n", + j2c56 => "20:-:-:-:1 \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>];\n", + j2c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>];\n", + j2c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>];\n", + j2c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>];\n", + + j3c31 => "--:-:-:-:1 \@P5 R2P PR, preds, 0x0f;\n", + j3c32 => "--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f;\n", + j3c33 => "--:-:-:-:1 \@P5 SHF.R.U64 preds, preds, 4, preds;\n", + j3c56 => "20:-:-:-:1 \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>];\n", + j3c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>];\n", + j3c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>];\n", + j3c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>];\n", + + j4c31 => "--:-:-:-:1 \@P5 R2P PR, preds, 0x0f;\n", + j4c32 => "--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f;\n", + j4c33 => "--:-:-:-:1 \@P5 SHF.L.U64 preds, preds, 4, preds;\n", + j4c56 => "20:-:-:-:1 \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>];\n", + j4c58 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>];\n", + j4c60 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>];\n", + j4c62 => "--:-:2:-:1 \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>];\n", + + ) : ( + j0c52 => "--:-:-:-:1 
ISETP.LT.AND P0, PT, k, param_K, P5;\n", + j0c53 => "--:-:-:-:1 ISETP.LT.AND P1, PT, k, param_Km32, P5;\n", + + j1c60 => "20:-:2:-:1 \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>];\n", + j2c60 => "20:-:2:-:1 \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>];\n", + j3c60 => "20:-:2:-:1 \@P0 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>];\n", + j4c60 => "20:-:2:-:1 \@P1 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>];\n", + ), + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeFs, writeFs, swapBuf;\n" . + "--:-:-:-:1 IADD writeIs, writeIs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P6 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P6' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx0, [readIs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dIx4, [readIs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + + +--:-:-:-:1 MOV alpha, param_alpha; + +--:-:-:-:1 ISETP.EQ.AND P4, PT, RZ, param_flags, PT; + +--:-:-:-:1 LOP.AND tid_31, tid, 31; +--:-:-:-:1 SHR.U32 tid_32, tid, 5; + +// readOs = (tid_32 << 7 + tid_31) << 2 +--:-:-:-:1 ISCADD readOs, tid_32, tid_31, 7; +--:-:-:-:1 SHL readOs, readOs, 2; + +--:-:-:-:1 ISETP.EQ.AND P6, PT, tid_31, RZ, PT; + +// k = idx_K*64 + tid_32 +--:-:-:-:1 ISCADD k00, idx_K, tid_32, 6; +--:-:-:-:1 IADD k04, k00, 4; +--:-:-:-:1 IADD k08, k00, 8; +--:-:-:-:1 IADD k12, k00, 12; + +[+ + our $bsum; return $bsum ? q{ +--:-:-:-:1 XMAD bsum_offset, idx_Q, param_gridN, idx_N; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_P, param_gridQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset; + } : ''; ++] + +[+ + our $LN; return $LN ? q{ +// n = idx_N*32 + tid31; +--:-:-:-:1 ISCADD N, idx_N, tid_31, 5; +// n < N +--:-:-:-:1 ISETP.LT.AND P4, PT, N, param_N, P4; + +// o = k*MPQN + m*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD offset, idx_Q, param_N, N; +--:-:-:-:1 XMAD.LO2C offset, idx_P, param_QN, offset; +--:-:-:-:1 XMAD.LO2C offset, idx_M, param_PQN, offset; + + } : q{ + +--:-:-:-:1 SHL M, idx_M, param_shiftM; +--:-:-:-:1 SHL P, idx_P, param_shiftP; +--:-:-:-:1 SHL Q, idx_Q, param_shiftQ; +--:-:-:-:1 SHL N, idx_N, param_shiftN; + +--:-:-:-:1 BFE.U32 super_M, tid_31, param_SuperM; +--:-:-:-:1 BFE.U32 super_P, tid_31, param_SuperP; +--:-:-:-:1 BFE.U32 super_Q, tid_31, param_SuperQ; +--:-:-:-:1 LOP.AND super_N, tid_31, param_SuperN; + +--:-:-:-:1 IADD M, M, super_M; +--:-:-:-:1 IADD P, P, super_P; +--:-:-:-:1 IADD Q, Q, super_Q; +--:-:-:-:1 IADD N, N, super_N; + +--:-:-:-:1 ISETP.LT.AND P0, PT, M, param_M, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, P, param_P, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, Q, param_Q, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, 
N, param_N, P0; +--:-:-:-:1 PSETP.AND.AND P4, PT, P0, P1, P2; + +// o = k*MPQN + m*PQN + p*QN + q*N + N +--:-:-:-:1 XMAD offset, Q, param_N, N; +--:-:-:-:1 XMAD.LO2C offset, P, param_QN, offset; +--:-:-:-:1 XMAD.LO2C offset, M, param_PQN, offset; + }; ++] +--:-:-:-:1 XMAD.LO2C offset, k00, param_MPQN, offset; + +--:-:-:-:1 MOV MPQN16, param_MPQN; +--:-:-:-:1 SHL MPQN4, MPQN16, [+ dshift()+2 +]; +--:-:-:-:1 SHL MPQN16, MPQN16, 4; + +--:-:-:-:1 MOV32I one, 1.0; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:1 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:1 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:1 STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 
[writeOs+4x<0*128 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y1; +--:-:-:-:1 STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y1; +--:-:-:-:1 STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y2; +--:-:-:-:1 STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y2; +--:-:-:-:1 STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y3; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_O; +--:-:-:-:0 IADD readOs, readOs, 4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_O; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:0 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:1 FMUL shuffle_x7y4, cx7y4, alpha; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:1 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:1 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:1 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:1 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:1 FMUL shuffle_x7y7, cx7y7, alpha; 
+--:-:-:-:1 STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y4; +--:-:-:-:1 STS.128 [writeOs+4x<0*128 + 16>], shuffle_x4y4; +--:-:-:-:1 STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y5; +--:-:-:-:1 STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y5; +--:-:-:-:1 STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y6; +--:-:-:-:1 STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y6; +--:-:-:-:1 STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y7; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:0 IADD readOs, readOs, -4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_O; +--:-:-:-:0 IADD readOs, readOs, 4x<16*128 + 4*16>; +--:-:-:-:5 CAL STORE_O; + +--:-:-:-:5 EXIT; + +STORE_O: + +--:-:-:-:2 ISETP.LT.AND P0, PT, k00, param_K, P4; // k00 < K && n < N +--:-:-:-:2 ISETP.LT.AND P1, PT, k04, param_K, P4; // k04 < K && n < N +--:-:-:-:2 ISETP.LT.AND P2, PT, k08, param_K, P4; // k08 < K && n < N +--:-:-:-:1 ISETP.LT.AND P3, PT, k12, param_K, P4; // k12 < K && n < N +[+ + our ($beta, $brelu, $bprelu, $dshift, $dtype); + return $beta || $brelu || $bprelu ? qq{ + +01:-:-:-:1 LEA Out00_0.CC, offset, param_X[0], $dshift; +--:-:-:-:1 LEA.HI.X Out00_1, offset, param_X[1], RZ, $dshift; +--:-:-:-:1 IADD Out04_0.CC, Out00_0, MPQN4; +--:-:-:-:1 IADD.X Out04_1, Out00_1, RZ; +--:-:-:-:1 IADD Out08_0.CC, Out04_0, MPQN4; +--:-:-:-:1 IADD.X Out08_1, Out04_1, RZ; +--:-:-:-:1 IADD Out12_0.CC, Out08_0, MPQN4; +--:-:-:-:1 IADD.X Out12_1, Out08_1, RZ; + +--:-:-:-:1 \@P0 LDG.E.CI.$dtype b00, [Out00_0]; +--:-:-:-:1 \@!P0 MOV b00, RZ; +--:-:5:-:1 \@P1 LDG.E.CI.$dtype b04, [Out04_0]; +--:-:-:-:1 \@!P1 MOV b04, RZ; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype b08, [Out08_0]; +--:-:-:-:1 \@!P2 MOV b08, RZ; +--:-:6:-:1 \@P3 LDG.E.CI.$dtype b12, [Out12_0]; +--:-:-:-:1 \@!P3 MOV b12, RZ; + + + } : ''; ++] +[+ + our $bias; + return $bias ? 
q{ + +20:-:-:-:1 LEA Sum00_0.CC, k00, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum00_1, k00, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum04_0.CC, k04, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum04_1, k04, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum08_0.CC, k08, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum08_1, k08, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum12_0.CC, k12, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum12_1, k12, param_Sum[1], RZ, 2; + +--:-:-:-:1 @P0 LDG.E.CI b00, [Sum00_0]; +--:-:-:-:1 @!P0 MOV b00, RZ; +--:-:5:-:1 @P1 LDG.E.CI b04, [Sum04_0]; +--:-:-:-:1 @!P1 MOV b04, RZ; +--:-:-:-:1 @P2 LDG.E.CI b08, [Sum08_0]; +--:-:-:-:1 @!P2 MOV b08, RZ; +--:-:6:-:1 @P3 LDG.E.CI b12, [Sum12_0]; +--:-:-:-:1 @!P3 MOV b12, RZ; + + + } : ''; ++] +--:-:-:-:1 LDS o00_0, [readOs + 4x< 0*128 + 0*32 + 0*16>]; +--:-:-:-:1 LDS o00_1, [readOs + 4x< 0*128 + 1*32 + 0*16>]; +--:-:-:-:1 LDS o00_2, [readOs + 4x< 0*128 + 2*32 + 0*16>]; +--:-:1:Y:1 LDS o00_3, [readOs + 4x< 0*128 + 3*32 + 0*16>]; +--:-:-:-:1 LDS o04_0, [readOs + 4x< 4*128 + 0*32 + 1*16>]; +--:-:-:-:1 LDS o04_1, [readOs + 4x< 4*128 + 1*32 + 1*16>]; +--:-:-:-:1 LDS o04_2, [readOs + 4x< 4*128 + 2*32 + 1*16>]; +--:-:2:Y:1 LDS o04_3, [readOs + 4x< 4*128 + 3*32 + 1*16>]; +--:-:-:-:1 LDS o08_0, [readOs + 4x< 8*128 + 0*32 + 2*16>]; +--:-:-:-:1 LDS o08_1, [readOs + 4x< 8*128 + 1*32 + 2*16>]; +--:-:-:-:1 LDS o08_2, [readOs + 4x< 8*128 + 2*32 + 2*16>]; +--:-:3:Y:1 LDS o08_3, [readOs + 4x< 8*128 + 3*32 + 2*16>]; +--:-:-:-:1 LDS o12_0, [readOs + 4x<12*128 + 0*32 + 3*16>]; +--:-:-:-:1 LDS o12_1, [readOs + 4x<12*128 + 1*32 + 3*16>]; +--:-:-:-:1 LDS o12_2, [readOs + 4x<12*128 + 2*32 + 3*16>]; +--:-:4:Y:1 LDS o12_3, [readOs + 4x<12*128 + 3*32 + 3*16>]; + + +01:-:-:-:1 FADD o00_0, o00_0, o00_1; +--:-:-:-:1 FADD o00_2, o00_2, o00_3; +02:-:-:-:1 FADD o04_0, o04_0, o04_1; +--:-:-:-:1 FADD o04_2, o04_2, o04_3; +04:-:-:-:1 FADD o08_0, o08_0, o08_1; +--:-:-:-:1 FADD o08_2, o08_2, o08_3; +08:-:-:-:1 FADD o12_0, o12_0, o12_1; +--:-:-:-:1 FADD o12_2, o12_2, 
o12_3; + +--:-:-:-:1 FADD out00, o00_0, o00_2; +--:-:-:-:1 FADD out04, o04_0, o04_2; +--:-:-:-:1 FADD out08, o08_0, o08_2; +--:-:-:-:3 FADD out12, o12_0, o12_2; +[+ + our $bias; return $bias ? q{ +10:-:-:-:1 FADD out00, out00, b00; +--:-:-:-:1 FADD out04, out04, b04; +20:-:-:-:1 FADD out08, out08, b08; +--:-:-:-:1 FADD out12, out12, b12; + } : ''; ++] +[+ + our $relu; return $relu ? q{ +// maximum(x, 0) +--:-:-:-:1 FMNMX out00, out00, RZ, !PT; +--:-:-:-:1 FMNMX out04, out04, RZ, !PT; +--:-:-:-:1 FMNMX out08, out08, RZ, !PT; +--:-:-:-:1 FMNMX out12, out12, RZ, !PT; + } : ''; ++] +[+ + our $prelu; return $prelu ? q{ +// maximum(x, 0) + slope * minimum(0, x) +--:-:-:-:1 FMNMX b00, out00, RZ, !PT; +--:-:-:-:1 FMNMX b04, out04, RZ, !PT; +--:-:-:-:1 FMNMX b08, out08, RZ, !PT; +--:-:-:-:1 FMNMX b12, out12, RZ, !PT; + +--:-:-:-:1 FMNMX x00, out00, RZ, PT; +--:-:-:-:1 FMNMX x04, out04, RZ, PT; +--:-:-:-:1 FMNMX x08, out08, RZ, PT; +--:-:-:-:1 FMNMX x12, out12, RZ, PT; + +--:-:-:-:1 FFMA out00, x00, param_beta, b00; +--:-:-:-:1 FFMA out04, x04, param_beta, b04; +--:-:-:-:1 FFMA out08, x08, param_beta, b08; +--:-:-:-:1 FFMA out12, x12, param_beta, b12; + } : ''; ++] + + +[+ + our ($beta, $brelu, $bprelu, $convert_in); + return $convert_in && ($beta || $brelu || $bprelu) ? qq{ +10:-:1:-:1 \@P0 $convert_in b00, b00; +--:-:2:-:1 \@P1 $convert_in b04, b04; +20:-:3:-:1 \@P2 $convert_in b08, b08; +--:-:4:-:1 \@P3 $convert_in b12, b12; + } : ''; ++] +[+ + our $beta; return $beta ? q{ +11:-:-:-:1 FFMA out00, b00, param_beta, out00; +02:-:-:-:1 FFMA out04, b04, param_beta, out04; +24:-:-:-:1 FFMA out08, b08, param_beta, out08; +08:-:-:-:1 FFMA out12, b12, param_beta, out12; + } : ''; ++] +[+ + our $brelu; return $brelu ? 
q{ +//delta *= x > 0 +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +11:-:-:-:1 FSETP.GT.AND P0, PT, b00, RZ, PT; +02:-:-:-:1 FSETP.GT.AND P1, PT, b04, RZ, PT; +24:-:-:-:1 FSETP.GT.AND P2, PT, b08, RZ, PT; +08:-:-:-:1 FSETP.GT.AND P3, PT, b12, RZ, PT; +--:-:-:-:1 @!P0 MOV out00, RZ; +--:-:-:-:1 @!P1 MOV out04, RZ; +--:-:-:-:1 @!P2 MOV out08, RZ; +--:-:-:-:1 @!P3 MOV out12, RZ; +--:-:-:Y:d R2P PR, preds, 0x0f; + + } : ''; ++] +[+ + our $bprelu; return $bprelu ? q{ +//delta *= ((x > 0) + slope * (x < 0)) +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +11:-:-:-:1 FSETP.GT.AND P0, PT, b00, RZ, PT; +02:-:-:-:1 FSETP.GT.AND P1, PT, b04, RZ, PT; +24:-:-:-:1 FSETP.GT.AND P2, PT, b08, RZ, PT; +08:-:-:-:1 FSETP.GT.AND P3, PT, b12, RZ, PT; +--:-:-:-:1 SEL x00, one, RZ, P0; +--:-:-:-:1 SEL x04, one, RZ, P1; +--:-:-:-:1 SEL x08, one, RZ, P2; +--:-:-:-:1 SEL x12, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b00, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b04, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b08, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b12, RZ, PT; +--:-:-:-:1 SEL b00, one, RZ, P0; +--:-:-:-:1 SEL b04, one, RZ, P1; +--:-:-:-:1 SEL b08, one, RZ, P2; +--:-:-:-:1 SEL b12, one, RZ, P3; +--:-:-:-:1 R2P PR, preds, 0x0f; +--:-:-:-:1 FFMA b00, b00, param_beta, x00; +--:-:-:-:1 FFMA b04, b04, param_beta, x04; +--:-:-:-:1 FFMA b08, b08, param_beta, x08; +--:-:-:-:1 FFMA b12, b12, param_beta, x12; +--:-:-:-:1 FMUL out00, out00, b00; +--:-:-:-:1 FMUL out04, out04, b04; +--:-:-:-:1 FMUL out08, out08, b08; +--:-:-:-:2 FMUL out12, out12, b12; + } : ''; ++] +[+ + our $bsum; return $bsum ? q{ +20:-:-:-:1 SEL sum00, out00, RZ, P0; +--:-:-:-:1 SEL sum04, out04, RZ, P1; +--:-:-:-:1 SEL sum08, out08, RZ, P2; +--:-:-:-:1 SEL sum12, out12, RZ, P3; + } : ''; ++] + +[+ + our $convert_out; return $convert_out ? 
qq{ +--:-:1:-:1 \@P0 $convert_out out00, out00; +--:-:2:-:1 \@P1 $convert_out out04, out04; +--:-:3:-:1 \@P2 $convert_out out08, out08; +--:-:4:-:1 \@P3 $convert_out out12, out12; + } : ''; ++] + + +--:-:-:-:1 LEA Out00_0.CC, offset, param_O[0], [+ dshift() +]; +--:-:-:-:1 LEA.HI.X Out00_1, offset, param_O[1], RZ, [+ dshift() +]; +--:-:-:-:1 IADD Out04_0.CC, Out00_0, MPQN4; +--:-:-:-:1 IADD.X Out04_1, Out00_1, RZ; +--:-:-:-:1 IADD Out08_0.CC, Out04_0, MPQN4; +--:-:-:-:1 IADD.X Out08_1, Out04_1, RZ; +--:-:-:-:1 IADD Out12_0.CC, Out08_0, MPQN4; +--:-:-:-:1 IADD.X Out12_1, Out08_1, RZ; + +01:-:-:-:1 @P0 STG.E.CG.[+ dtype() +] [Out00_0], out00; +02:-:-:-:1 @P1 STG.E.CG.[+ dtype() +] [Out04_0], out04; +04:-:-:-:1 @P2 STG.E.CG.[+ dtype() +] [Out08_0], out08; +08:1:-:-:1 @P3 STG.E.CG.[+ dtype() +] [Out12_0], out12; + + +[+ + our $bsum; return $bsum ? q{ + +--:-:-:-:1 XMAD.LO2C bsum00, k00, param_gridMPQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum04, k04, param_gridMPQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum08, k08, param_gridMPQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum12, k12, param_gridMPQN, bsum_offset; +--:-:-:-:1 LEA Sum00_0.CC, bsum00, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum00_1, bsum00, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum04_0.CC, bsum04, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum04_1, bsum04, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum08_0.CC, bsum08, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum08_1, bsum08, param_Sum[1], RZ, 2; +--:-:-:-:1 LEA Sum12_0.CC, bsum12, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum12_1, bsum12, param_Sum[1], RZ, 2; +--:-:-:-:1 ISETP.LT.AND P0, PT, k00, param_K, P6; // k00 < K && tid31 == 0 +--:-:-:-:1 ISETP.LT.AND P1, PT, k04, param_K, P6; // k04 < K && tid31 == 0 +--:-:-:-:1 ISETP.LT.AND P2, PT, k08, param_K, P6; // k08 < K && tid31 == 0 +--:-:-:-:1 ISETP.LT.AND P3, PT, k12, param_K, P6; // k12 < K && tid31 == 0 + +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 1, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 1, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, 
sum08, 1, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 1, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:1 FADD sum12, x12, sum12; +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 2, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 2, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, sum08, 2, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 2, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:1 FADD sum12, x12, sum12; +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 4, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 4, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, sum08, 4, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 4, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:1 FADD sum12, x12, sum12; +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 8, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 8, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, sum08, 8, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 8, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:1 FADD sum12, x12, sum12; +--:-:-:-:1 SHFL.BFLY PT, x00, sum00, 16, 0x1f; +--:-:5:-:1 SHFL.BFLY PT, x04, sum04, 16, 0x1f; +--:-:-:-:1 SHFL.BFLY PT, x08, sum08, 16, 0x1f; +--:-:6:-:1 SHFL.BFLY PT, x12, sum12, 16, 0x1f; +10:-:-:-:1 FADD sum00, x00, sum00; +--:-:-:-:1 FADD sum04, x04, sum04; +20:-:-:-:1 FADD sum08, x08, sum08; +--:-:-:-:0 FADD sum12, x12, sum12; + + +--:-:-:-:1 @P0 STG.E.CG [Sum00_0], sum00; +--:-:-:-:1 @P1 STG.E.CG [Sum04_0], sum04; +--:-:-:-:1 @P2 STG.E.CG [Sum08_0], sum08; +--:6:-:-:1 @P3 STG.E.CG [Sum12_0], sum12; + } : ''; ++] + +--:-:-:-:1 IADD k00, k00, 16; +--:-:-:-:1 IADD k04, k04, 16; +--:-:-:-:1 IADD k08, k08, 16; +--:-:-:-:1 IADD k12, k12, 16; +--:-:-:-:0 IADD offset, offset, MPQN16; + +--:-:-:-:5 RET; \ No newline at end of file diff --git 
a/Kernel/Convolution/Pascal/xconv_winograd_2x2_3x3_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_2x2_3x3_32x32.sass new file mode 100644 index 0000000..a8a1ef4 --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_winograd_2x2_3x3_32x32.sass @@ -0,0 +1,1568 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $type; # declared here without a value; compared against 'h' below to pick the 16-bit variants +our $dtype = $type eq 'h' ? 'U16' : '32'; # width suffix appended to LDG.E.CI. through dtype() or $dtype +our $dshift = $type eq 'h' ? '1' : '2'; # shift operand given to LEA when forming the track address +our $dsize = $type eq 'h' ? '2' : '4'; # multiplier prefix of the load offsets written via dsize() or $dsize +our $vsize = $type eq 'h' ? '64' : '128'; # width suffix of the LDS.U. and LDG.E.CG. filter loads on the FX path + +sub dtype { return $dtype; } +sub dsize { return $dsize; } +sub dshift { return $dshift; } # the three subs are accessors for use inside inline template expressions + +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; # conversion opcode for loaded values; the empty string disables the conversion inserts +our $convert_out = $type eq 'h' ? 
'F2F.F16.F32' : ''; +-] + + + param_S[0] : c[0x0][0x140] + param_S[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_C : c[0x0][0x174] + param_H : c[0x0][0x178] + param_P : c[0x0][0x17c] + param_pad_h : c[0x0][0x180] + param_pad_w : c[0x0][0x184] + param_HWN : c[0x0][0x188] + param_WN : c[0x0][0x18c] + param_PQN : c[0x0][0x190] + param_QN : c[0x0][0x194] + param_Qnk : c[0x0][0x198] + param_nk : c[0x0][0x19c] + param_n : c[0x0][0x1a0] + param_k : c[0x0][0x1a4] + param_magic_Qnk : c[0x0][0x1a8] + param_shift_Qnk : c[0x0][0x1ac] + param_magic_nk : c[0x0][0x1b0] + param_shift_nk : c[0x0][0x1b4] + param_magic_k : c[0x0][0x1b8] + param_shift_k : c[0x0][0x1bc] + param_RSK : c[0x0][0x1c0] + param_4RSKp : c[0x0][0x1c4] + param_4HWNp : c[0x0][0x1c8] + param_gridK : c[0x0][0x1cc] + param_gridP2 : c[0x0][0x1d0] + param_gridQ : c[0x0][0x1d4] + param_gridN : c[0x0][0x1d8] + param_gridQN : c[0x0][0x1dc] + param_gridPQN : c[0x0][0x1e0] + param_superP : c[0x0][0x1e4] + param_superQ : c[0x0][0x1e8] + param_superN : c[0x0][0x1ec] + param_shiftP : c[0x0][0x1f0] + param_shiftQ : c[0x0][0x1f4] + param_shiftN : c[0x0][0x1f8] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ix<0-7>, j0Fy<0-7> + 80-95 : j1Ix<0-7>, j1Fy<0-7> + + 64-79 ~ tid, idx_P, idx_Q, idx_N, idx_K, idx_n, idx_k, tid16, tid31, c, addr_zero, partialC + 80-119 ~ tid1, idx_PQnk, idx_Qnk, idx_nk, magic_Qnk, neg_Qnk, neg_nk, neg_k, div<1-3>, 
idx_P2, idx_Q2, z<1-2>, negOne, super_P, super_Q + 80-95 ~ super_N, y, x, ti, ti_sign, x<1-3>, mask_x, preds1, offsetIC + 80-95 ~ tf, tid31_4, offsetFC + + 120-121 : track<0-1> + 122-127 ~ writeS, readFs, readIs, C, preds, idx_nkpq + + 80-95 ~ p, q, n, tid32, tid64, tid_16, tid_1, q2, p2, to, superP, superQ, superN + 96-99 : Out<0-1>, Sum<0-1> + 100-121 ~ alpha, one, writeCs, readCs, k, PQN15, tid_31, out_offset, bsum_offset + + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + + 64-79 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3> + 80-95 : t0<0-1>, t1<0-1>, t2<0-1>, t3<0-1> + + 3,2,11,10,19,18,27,26,1,0,9,8,17,16,25,24 ~ b<00|01|10|11>, x<00|01|10|11>, sum<0|1>, s0<0-1>, s1<0-1> + + // Image registers (registers assigned to avoid bank conflicts) + 96 = i00 + 97 = i01 + 98 = i02 + 99 = i03 + 100 = i30 + 101 = i31 + 102 = i32 + 103 = i33 + 105 = i13 + 104 = i12 + 107 = i11 + 106 = i10 + 108 = i23, TI23, I23 + 109 = i22, TI22 + 110 = i21, TI21 + 111 = i20, TI20, I20 + 113 = TI00, I00, TI10, I10, I21, I01 + 112 = TI01, I11 + 115 = TI02, I12 + 114 = TI03, I03, TI11, I31 + 116 = TI30, I30, TI12, I32 + 117 = TI31 + 118 = TI32 + 119 = TI33, I33, TI13, I13, I22, I02 + // Filter registers +[+ + our $FX; + return $FX ? 
q{ + 104-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3> + } : q{ + 96 = f00, TF00, F00 + 97 = f01, TF01 + 98 = f02, TF02, F03 + 99 = f10 + 100 = f11 + 101 = f12 + 102 = f20, TF30, F30 + 103 = f21, TF31 + 104 = f22, TF32, F33 + 105 = tb3, F32 + 106 = tb0, F02 + 107 = ta2, TF22, F23 + 108 = ta0, TF20, F20 + 109 = ta1, TF21 + 110 = F01 + 111 = F31 + 112 = TF10, F10 + 113 = TF11 + 114 = TF12, F13 + 115 = tb1, F12 + 116 = tb2, F22 + 117 = F11 + 118 = F21 + }; ++] + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_PQnk, SR_CTAID.X; + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 128, PT; // P0 = tid >= 128; tested later by the branch to FILTER_SETUP + +--:-:-:-:1 LOP.AND tid31, tid, 31; // tid31 = tid & 31 + +// c = (tid & 127) / 32 +--:-:-:-:1 BFE.U32 c, tid, 0x205; // 2 bits at position 5 + +--:-:-:-:1 SHL addr_zero, tid31, 4; +--:-:-:-:1 ISCADD addr_zero, c, addr_zero, 11; // addr_zero = c*2048 + tid31*16 +--:-:-:-:1 @P0 IADD addr_zero, addr_zero, 4x<512*4>; // threads with P0 set use a region offset by 4x<512*4> + +--:-:-:-:1 STS.128 [addr_zero + 4x<00*4>], RZ; +--:-:-:-:1 STS.128 [addr_zero + 4x<32*4>], RZ; +--:-:-:-:1 STS.128 [addr_zero + 4x<64*4>], RZ; +--:-:-:-:1 STS.128 [addr_zero + 4x<96*4>], RZ; // four 128-bit zero stores; the generated LDS.U.128 lines below load czero00, czero04, ... czero60 from addr_zero + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + + +// idx_P2 = idx_PQnk / blk_Qnk +--:-:-:-:1 MOV magic_Qnk, param_magic_Qnk; +--:-:-:-:1 ISETP.NE.AND P1, PT, magic_Qnk, 1, PT; // P1 = magic != 1: multiply by the magic number, otherwise only shift +02:-:-:-:1 @P1 XMAD div1, idx_PQnk, magic_Qnk, RZ; +--:-:-:-:1 @P1 XMAD div2, idx_PQnk, magic_Qnk.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, idx_PQnk.H1, magic_Qnk.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk, div1; +--:-:-:-:1 @P1 IADD3.RS idx_P2, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 idx_P2, idx_P2, param_shift_Qnk; +--:-:-:-:1 @!P1 SHR.U32 idx_P2, idx_PQnk, param_shift_Qnk; + +// idx_Qnk = idx_PQnk % blk_Qnk +--:-:-:-:1 IADD neg_Qnk, RZ, -param_Qnk; +--:-:-:-:1 XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk; // idx_PQnk - idx_P2*Qnk + +// idx_Q2 = idx_Qnk / nk +--:-:-:-:1 XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ; +--:-:-:-:1 SHR.U32 idx_Q2, idx_Q2, param_shift_nk; +// idx_nk = idx_Qnk % nk +--:-:-:-:1 IADD neg_nk, RZ, -param_nk; 
+--:-:-:-:1 XMAD.S16.U16 idx_nk, neg_nk, idx_Q2, idx_Qnk; // idx_Qnk - idx_Q2*nk + +// idx_n = idx_nk / k +--:-:-:-:1 XMAD idx_n, idx_nk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_n, idx_n, param_shift_k; +// idx_k = idx_nk % k +--:-:-:-:1 IADD neg_k, RZ, -param_k; +--:-:-:-:1 XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk; // idx_nk - idx_n*k + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// idx_P = idx_P2 * 2 +// idx_Q = idx_Q2 +// if idx_P2 != gridP2: +// idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1) +// idx_Q = idx_Q2 >> 1 +--:-:-:-:1 ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT; +--:-:-:-:1 SHL idx_P, idx_P2, 1; +--:-:-:-:1 @P1 LOP.AND z1, idx_Q2, 1; +--:-:-:-:1 @P1 BFE.U32 z2, idx_Q2, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P1 LOP.XOR z1, z1, z2; +--:-:-:-:1 @P1 IADD idx_P, idx_P, z1; +--:-:-:-:1 @P1 SHR.U32 idx_Q, idx_Q2, 1; +--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2; + +// Scan backwards on odd rows +// if idx_P2 & 1: +// idx_Q = gridQ - idx_Q - 1 +--:-:-:-:1 LOP.AND.NZ P2, RZ, idx_P2, 1; +--:-:-:-:1 MOV negOne, -1; +--:-:-:-:1 @P2 IADD3 idx_Q, -idx_Q, param_gridQ, negOne; + +--:-:-:-:1 BFI idx_nkpq, idx_P, 0x0c0c, idx_Q; // idx_P: 12 bits at position 12 over idx_Q (assuming the immediate is read like the BFE ones: length high byte, position low byte) +--:-:-:-:1 BFI idx_nkpq, idx_k, 0x0418, idx_nkpq; // idx_k: 4 bits at position 24 +--:-:-:-:1 BFI idx_nkpq, idx_n, 0x041c, idx_nkpq; // idx_n: 4 bits at position 28 + +// x = grid_x << shiftX +// y = grid_y << shiftY +--:-:-:-:1 SHL idx_P, idx_P, param_shiftP; +--:-:-:-:1 SHL idx_Q, idx_Q, param_shiftQ; + +// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp +--:-:-:-:1 BFE.U32 super_P, tid, param_superP; +--:-:-:-:1 BFE.U32 super_Q, tid, param_superQ; +--:-:-:-:1 ISCADD idx_P, super_P, idx_P, 1; // idx_P += super_P * 2 +--:-:-:-:1 ISCADD idx_Q, super_Q, idx_Q, 1; // idx_Q += super_Q * 2 + +// If this value is not a multiple of 4 we want to grab the partial amount on the first fetch. +// If it is a multiple of 4 then make a full 4 line fetch. 
+--:-:-:-:1 MOV C, param_C; +--:-:-:-:1 LOP.AND.Z P6, partialC, C, 3; // partialC = C & 3; P6 = (partialC == 0) +--:-:-:-:1 @!P6 IADD3 C, C, 4, -partialC; // C is not a multiple of 4: round it up to the next multiple +--:-:-:-:1 @P6 MOV partialC, 4; +// P6 = c < partialC +--:-:-:-:1 ISETP.LT.AND P6, PT, c, partialC, PT; + +[+ + our $FX; return $FX ? '' : q{ +// writeS = c*512 + tid & 31 +--:-:-:-:1 ISCADD writeS, c, tid31, 9; +--:-:-:-:1 ISCADD writeS, writeS, 4x<512*4*2>, 2; + } ++] + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readFs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readIs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid16; +--:-:-:-:1 SHL readIs, readIs, 4; // scale the index by 16 + +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 LOP3.LUT readFs, readFs, tid16, tid1, 0xfe; // three-input OR, matching the readFs formula above +--:-:-:-:1 ISCADD readFs, readFs, 4x<512*4>, 4; // readFs = readFs*16 + 4x<512*4> + + +--:-:-:-:5 @P0 BRA.U FILTER_SETUP; // P0 still holds tid >= 128 from the prologue + +--:-:1:-:2 S2R idx_N, SR_CTAID.Z; + + + + +// writeS = (c*512 + (tid & 31))*4 + 4x<512*4*2> +[+ + our $FX; + return $FX ? 
q{ +--:-:-:-:1 ISCADD writeS, c, tid31, 9; +--:-:-:-:1 ISCADD writeS, writeS, 4x<512*4*2>, 2; + } : ''; ++] + +--:-:-:-:1 LOP.AND super_N, tid, param_superN; + +01:-:-:-:1 XMAD idx_N, idx_N, param_n, idx_n; // idx_N = idx_N*n + idx_n +--:-:-:-:1 SHL idx_N, idx_N, param_shiftN; +--:-:-:-:1 IADD idx_N, idx_N, super_N; + +// n < N +--:-:-:-:1 ISETP.LT.AND P5, PT, idx_N, 1x<$N>, PT; + +// Subtract off the padding +--:-:-:-:1 IADD y, idx_P, -param_pad_h; +--:-:-:-:1 IADD x, idx_Q, -param_pad_w; + +// a0 = n + x*N + y*XN + c*YXN +--:-:-:-:1 XMAD.S16.U16 ti, x, 1x<$N>, idx_N; +--:-:-:-:1 XMAD.S16.U16.LO2C ti, y, param_WN, ti; +--:-:-:-:1 XMAD.S16.U16.LO2C ti, c, param_HWN, ti; +--:-:-:-:1 ISET.LT.AND ti_sign, ti, RZ, PT; // ti_sign is derived from ti < 0 and is added into the high address word below (presumably as a sign extension) +--:-:-:-:1 LEA track0.CC, ti, param_I[0], [+ dshift() +]; +--:-:-:-:1 IADD.X track1, ti_sign, param_I[1]; + +--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 IADD x2, x, 2; +--:-:-:-:1 IADD x3, x, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x, 1x<$W>, PT; // column tests: P0-P3 = (0 <= x+i < W) for i = 0..3, built in two steps +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, 1x<$W>, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, 1x<$W>, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, 1x<$W>, PT; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x0f; // capture P0-P3 (the four column tests) in mask_x + +--:-:-:-:1 IADD x1, y, 1; // x1..x3 are reused to hold y+1..y+3 +--:-:-:-:1 IADD x2, y, 2; +--:-:-:-:1 IADD x3, y, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, y, param_H, P5; // row tests, each also ANDed with P5 (n < N) +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_H, P5; +--:-:-:-:1 ISETP.GE.AND P0, PT, y, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; + +--:-:-:-:1 SEL preds, mask_x, RZ, P0; // row 0: bits 0-3 get the column mask if the row test passed, else 0 +--:-:-:-:1 @P1 BFI preds, mask_x, 0x404, preds; // row 1: 4 bits at position 4 (assuming the same length/position immediate layout as the annotated BFE lines) +--:-:-:-:1 @P2 BFI preds, mask_x, 0x408, preds; // row 2: 4 bits at position 8 +--:-:-:-:1 @P3 BFI preds, mask_x, 0x40c, preds; // row 3: 4 bits at position 12 + +// For partial C on first load +--:-:-:-:1 SEL 
preds1, preds, RZ, P6; + +// offsetIC = partialC*YXN +--:-:-:-:1 XMAD.LO2C offsetIC, partialC, param_HWN, RZ; + +--:-:-:-:1 R2P PR, preds1, 0x0f; // P0-P3 from the low four bits of preds1: the row 0 tests +--:-:-:-:1 SHF.R.U64 preds1, preds1, 12, preds1; // same register in both halves, so presumably a rotate: the row 3 field (bits 12-15) becomes the low four bits + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>]; +--:-:-:-:1 @!P0 MOV i00, RZ; +--:-:-:-:1 @!P1 MOV i01, RZ; +--:-:-:-:1 @!P2 MOV i02, RZ; +--:-:-:-:1 @!P3 MOV i03, RZ; + +--:-:-:-:1 R2P PR, preds1, 0x0f; // row 3 tests +--:-:-:-:1 SHF.L.U64 preds1, preds1, 8, preds1; // net rotation is now 4: the row 1 field comes next + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>]; +--:-:-:-:1 @!P0 MOV i30, RZ; +--:-:-:-:1 @!P1 MOV i31, RZ; +--:-:-:-:1 @!P2 MOV i32, RZ; +--:-:-:-:1 @!P3 MOV i33, RZ; + +--:-:-:-:1 R2P PR, preds1, 0x0f; // row 1 tests +--:-:-:-:1 SHF.R.U64 preds1, preds1, 4, preds1; // net rotation is now 8: the row 2 field comes next + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>]; +--:-:-:-:1 @!P0 MOV i10, RZ; +--:-:-:-:1 @!P1 MOV i11, RZ; +--:-:-:-:1 @!P2 MOV i12, RZ; +--:-:-:-:1 @!P3 MOV i13, RZ; + +--:-:-:-:1 R2P PR, preds1, 0x0f; // row 2 tests + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 
LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>]; +--:6:2:-:1 @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>]; +--:-:-:-:1 @!P0 MOV i20, RZ; +--:-:-:-:1 @!P1 MOV i21, RZ; +--:-:-:-:1 @!P2 MOV i22, RZ; +--:-:-:-:1 @!P3 MOV i23, RZ; + + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>]; + +20:-:-:-:6 LEA track0.CC, offsetIC, track0, [+ dshift() +]; +--:-:-:-:0 LEA.HI.X track1, offsetIC, track1, RZ, [+ dshift() +]; + +--:-:-:-:5 BRA.U IMAGE_LOOP; + + + +FILTER_SETUP: + +--:-:1:-:2 S2R idx_K, SR_CTAID.Y; + + +01:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +[+ + our ($dtype, $dshift, $FX, $K, $vsize, $dsize); + return $FX ? qq{ + +// writeS = (c*512 + (tid & 31)*4)*4 +--:-:-:-:1 SHL writeS, tid31, 4; +--:-:-:-:1 ISCADD writeS, c, writeS, 11; +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<512*4*2>; + +// (kBlks,C,4,4,32) +// offset = idx_K*C*512 + c*512 + tid31*4; +--:-:-:-:1 SHL tid31_4, tid31, 2; +--:-:-:-:1 XMAD tf, idx_K, param_C, c; +--:-:-:-:1 ISCADD tf, tf, tid31_4, 9; +--:-:-:-:1 LEA track0.CC, tf, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1, tf, param_F[1], RZ, $dshift; + +// offsetFC = partialC*512 +--:-:-:-:1 SHL offsetFC, partialC, 9; + +--:-:-:-:1 \@!P6 LDS.U.$vsize F0, [addr_zero]; +--:-:-:-:1 \@!P6 LDS.U.$vsize F1, [addr_zero]; +--:-:-:-:1 \@!P6 LDS.U.$vsize F2, [addr_zero]; +--:-:-:-:1 \@!P6 LDS.U.$vsize F3, [addr_zero]; + +--:-:2:-:1 \@P6 LDG.E.CG.$vsize F0, [track + 4x<00 * $dsize>]; +--:-:3:-:1 \@P6 LDG.E.CG.$vsize F1, [track + 4x<32 * $dsize>]; +--:-:4:-:1 \@P6 LDG.E.CG.$vsize F2, [track + 4x<64 * $dsize>]; +--:6:5:-:1 \@P6 LDG.E.CG.$vsize F3, [track + 4x<96 * $dsize>]; + + + } : qq{ +// k = idx_K*32 + tid & 31 +--:-:-:-:1 ISCADD idx_K, idx_K, tid31, 5; +--:-:-:-:1 ISETP.LT.AND P0, PT, idx_K, 1x<$K>, P6; 
+--:-:-:-:1 ISETP.LT.AND P1, PT, idx_K, 1x<$K>, PT; + +// offsetFC = partialC * RSK +--:-:-:-:1 XMAD.LO2C offsetFC, partialC, param_RSK, RZ; + +// a0 = k + c*RSK +--:-:-:-:1 XMAD.LO2C tf, c, param_RSK, idx_K; + +--:-:-:-:1 LEA track0.CC, tf, param_F[0], $dshift; +--:-:-:-:1 LEA.HI.X track1, tf, param_F[1], RZ, $dshift; + +--:-:-:-:1 \@!P0 MOV f00, RZ; +--:-:-:-:1 \@!P0 MOV f01, RZ; +--:-:-:-:1 \@!P0 MOV f02, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>]; +--:-:-:-:1 \@!P0 MOV f20, RZ; +--:-:-:-:1 \@!P0 MOV f21, RZ; +--:-:-:-:1 \@!P0 MOV f22, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>]; +--:-:-:-:1 \@!P0 MOV f10, RZ; +--:-:-:-:1 \@!P0 MOV f11, RZ; +--:-:-:-:1 \@!P0 MOV f12, RZ; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>]; +--:-:-:-:1 \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>]; +--:6:2:-:1 \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>]; + }; ++] + + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>]; + +20:-:-:-:6 LEA track0.CC, offsetFC, track0, [+ dshift() +]; +--:-:-:-:0 LEA.HI.X track1, offsetFC, track1, RZ, [+ dshift() +]; + +--:-:-:-:5 BRA.U FILTER_LOOP; + + +IMAGE_LOOP: +--:-:-:-:1 ISETP.GT.AND P6, PT, C, 4, PT; +[+ + our ($dtype, $dsize, $convert_in, $W, $N); + my %insert = ( + + j0c1 => "--:-:-:-:1 ISETP.GT.AND P5, PT, C, RZ, PT;\n" . 
+ "--:-:-:-:1 IADD C, C, -4;\n", + + + j0c14 => "--:-:-:-:1 R2P PR, preds, 0x0f;\n", + j0c16 => "--:-:-:-:1 \@P6 SHF.R.U64 preds, preds, 12, preds;\n", + + $convert_in ? ( + j0c3 => "02:-:-:-:1 $convert_in i00, i00;\n", + j0c5 => "--:-:-:-:1 $convert_in i01, i01;\n", + j0c7 => "--:-:-:-:1 $convert_in i02, i02;\n", + j0c9 => "--:-:-:-:0 \@!P6 MOV preds, RZ;\n" . + "--:-:-:-:1 $convert_in i03, i03;\n", + + j0c11 => "--:-:-:-:1 $convert_in i20, i20;\n", + j0c13 => "--:-:-:-:1 $convert_in i21, i21;\n", + j0c15 => "--:-:-:-:1 $convert_in i22, i22;\n", + j0c17 => "--:-:2:-:1 $convert_in i23, i23;\n", + + j0c19 => "--:-:-:-:1 $convert_in i10, i10;\n", + j0c21 => "--:-:-:-:1 $convert_in i11, i11;\n", + j0c23 => "--:-:-:-:1 $convert_in i12, i12;\n", + j0c25 => "--:-:-:-:1 $convert_in i13, i13;\n", + + j0c27 => "--:-:-:-:1 $convert_in i30, i30;\n", + j0c29 => "--:-:-:-:1 $convert_in i31, i31;\n", + j0c31 => "--:-:-:-:1 $convert_in i32, i32;\n", + j0c33 => "--:-:3:-:1 $convert_in i33, i33;\n", + ) : ( + j0c9 => "--:-:-:-:1 \@!P6 MOV preds, RZ;\n", + ), + + j0c32 => "02:-:-:-:1 \@P5 FADD TI00, i00, -i20;\n" . + "--:-:-:-:1 \@P5 FADD TI01, i01, -i21;\n" . + "--:-:-:-:1 \@P5 FADD TI02, i02, -i22;\n" . + "--:-:-:-:1 \@P5 FADD TI03, i03, -i23;\n", + + j0c35 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n", + j0c37 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n", + j0c39 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype i02, [track + ${dsize}x<0*$W*$N + 2*$N>];\n", + j0c41 => "--:-:-:-:1 \@P3 LDG.E.CI.$dtype i03, [track + ${dsize}x<0*$W*$N + 3*$N>];\n", + j0c43 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i00, RZ;\n", + j0c45 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i01, RZ;\n", + j0c47 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i02, RZ;\n", + j0c49 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i03, RZ;\n" . + "--:-:-:-:1 R2P PR, preds, 0x0f;\n", + + j0c50 => "--:-:-:-:1 \@P6 SHF.L.U64 preds, preds, 8, preds;\n", + + j0c55 => "04:-:-:-:1 \@P5 FADD TI30, i10, -i30;\n" . 
+ "--:-:-:-:1 \@P5 FADD TI31, i11, -i31;\n" . + "--:-:-:-:1 \@P5 FADD TI32, i12, -i32;\n" . + "--:-:-:-:1 \@P5 FADD TI33, i13, -i33;\n", + + j0c57 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype i30, [track + ${dsize}x<3*$W*$N + 0*$N>];\n", + j0c59 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype i31, [track + ${dsize}x<3*$W*$N + 1*$N>];\n", + j0c61 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype i32, [track + ${dsize}x<3*$W*$N + 2*$N>];\n", + j0c63 => "--:-:-:-:1 \@P3 LDG.E.CI.$dtype i33, [track + ${dsize}x<3*$W*$N + 3*$N>];\n", + j1c1 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i30, RZ;\n", + j1c3 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i31, RZ;\n", + j1c5 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i32, RZ;\n", + j1c7 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i33, RZ;\n" . + "--:-:-:-:1 R2P PR, preds, 0x0f;\n" . + "--:-:-:-:1 \@P5 FADD I00, TI00, -TI02;\n" . + "--:-:-:-:1 \@P5 FADD I03, TI01, -TI03;\n" . + "--:-:-:-:1 \@P5 FADD I30, TI30, -TI32;\n" . + "--:-:-:-:1 \@P5 FADD I33, TI31, -TI33;\n" . + "--:-:-:-:1 \@P6 SHF.R.U64 preds, preds, 4, preds;\n", + + j1c9 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*4 + 0)>], I00;\n", + j1c11 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*4 + 3)>], I03;\n", + j1c13 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*4 + 0)>], I30;\n", + j1c15 => "--:3:-:-:1 \@P5 STS [writeS + 4x<32*(3*4 + 3)>], I33;\n", + + + j1c29 => "04:-:-:-:1 \@P5 FADD TI10, i10, i20;\n" . + "--:-:-:-:1 \@P5 FADD TI20, -i10, i20;\n" . + "--:-:-:-:1 \@P5 FADD TI11, i11, i21;\n" . + "--:-:-:-:1 \@P5 FADD TI21, -i11, i21;\n" . + "--:-:-:-:1 \@P5 FADD TI12, i12, i22;\n" . + "--:-:-:-:1 \@P5 FADD TI22, -i12, i22;\n" . + "--:-:-:-:1 \@P5 FADD TI13, i13, i23;\n" . 
+ "--:-:-:-:1 \@P5 FADD TI23, -i13, i23;\n", + + j1c30 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n", + j1c32 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n", + j1c34 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype i12, [track + ${dsize}x<1*$W*$N + 2*$N>];\n", + j1c36 => "--:-:-:-:1 \@P3 LDG.E.CI.$dtype i13, [track + ${dsize}x<1*$W*$N + 3*$N>];\n", + j1c38 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i10, RZ;\n", + j1c40 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i11, RZ;\n", + j1c42 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i12, RZ;\n", + j1c44 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i13, RZ;\n" . + "--:-:-:-:1 R2P PR, preds, 0x0f;\n" . + "--:-:-:-:1 \@P5 FADD I10, TI10, -TI12;\n" . + "--:-:-:-:1 \@P5 FADD I20, TI20, -TI22;\n" . + "--:-:-:-:1 \@P5 FADD I13, TI11, -TI13;\n" . + "--:-:-:-:1 \@P5 FADD I23, TI21, -TI23;\n" . + "--:-:-:-:1 \@P6 SHF.L.U64 preds, preds, 8, preds;\n", + + j1c46 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*4 + 0)>], I10;\n", + j1c48 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*4 + 0)>], I20;\n", + j1c50 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*4 + 3)>], I13;\n", + j1c52 => "--:3:-:-:1 \@P5 STS [writeS + 4x<32*(2*4 + 3)>], I23;\n", + + + j2c8 => "04:-:-:-:1 \@P5 FADD I21, TI21, TI22;\n" . 
+ "--:-:-:-:1 \@P5 FADD I22, -TI21, TI22;\n", + + j2c11 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*4 + 1)>], I21;\n", + j2c13 => "--:3:-:-:1 \@P5 STS [writeS + 4x<32*(2*4 + 2)>], I22;\n", + + j2c15 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype i20, [track + ${dsize}x<2*$W*$N + 0*$N>];\n", + j2c17 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype i21, [track + ${dsize}x<2*$W*$N + 1*$N>];\n", + j2c19 => "--:-:-:-:1 \@P2 LDG.E.CI.$dtype i22, [track + ${dsize}x<2*$W*$N + 2*$N>];\n", + j2c21 => "--:6:2:-:1 \@P3 LDG.E.CI.$dtype i23, [track + ${dsize}x<2*$W*$N + 3*$N>];\n", + j2c23 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i20, RZ;\n", + j2c25 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i21, RZ;\n", + j2c27 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i22, RZ;\n", + j2c29 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i23, RZ;\n", + + j2c30 => "04:-:-:-:1 \@P5 FADD I01, TI01, TI02;\n" . + "--:-:-:-:1 \@P5 FADD I02, -TI01, TI02;\n" . + "--:-:-:-:1 \@P5 FADD I11, TI11, TI12;\n" . + "--:-:-:-:1 \@P5 FADD I12, -TI11, TI12;\n" . + "--:-:-:-:1 \@P5 FADD I31, TI31, TI32;\n" . + "--:-:-:-:1 \@P5 FADD I32, -TI31, TI32;\n", + + j2c31 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*4 + 1)>], I01;\n", + j2c33 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*4 + 2)>], I02;\n", + j2c35 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*4 + 1)>], I11;\n", + j2c37 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*4 + 2)>], I12;\n", + j2c39 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*4 + 1)>], I31;\n", + j2c41 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*4 + 2)>], I32;\n", + + j2c62 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P5 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" . + "--:-:-:-:1 \@P5 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" . + "--:-:-:-:1 \@P5 LOP.XOR writeS, writeS, 4x<512*4*2>;\n", + + j3c57 => "20:-:-:-:1 \@P6 IADD track0.CC, track0, param_4HWNp;\n", + j3c62 => "--:-:-:-:1 \@P6 IADD.X track1, track1, RZ;\n", + + j3c63 => "--:-:-:Y:5 \@P5 BRA.U IMAGE_LOOP;\n" . 
+ "--:-:-:Y:5 BRA.U END_LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 3) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 4; + my $rsPred = $j == 3 ? '@P5' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +FILTER_LOOP: +--:-:-:-:1 ISETP.GT.AND P0, PT, C, RZ, PT; +[+ + our ($dtype, $convert_in, $FX, $vsize, $dsize, $K); + my %insert = ( + + j0c1 => "--:-:-:-:1 ISETP.GT.AND P1, PT, C, 4, PT;\n" . + "--:-:-:-:1 IADD C, C, -4;\n", + + $FX ? ( + $convert_in ? 
( + j1c8 => "02:-:-:-:1 \@P0 $convert_in F03, F01.H1;\n", + j1c12 => "--:-:-:-:1 \@P0 $convert_in F02, F01.H0;\n", + j1c16 => "--:-:-:-:1 \@P0 $convert_in F01, F00.H1;\n", + j1c20 => "--:-:2:-:1 \@P0 $convert_in F00, F00.H0;\n", + + j1c26 => "04:-:-:-:1 \@P0 $convert_in F13, F11.H1;\n", + j1c30 => "--:-:-:-:1 \@P0 $convert_in F12, F11.H0;\n", + j1c34 => "--:-:-:-:1 \@P0 $convert_in F11, F10.H1;\n", + j1c38 => "--:-:3:-:1 \@P0 $convert_in F10, F10.H0;\n", + + j2c8 => "08:-:-:-:1 \@P0 $convert_in F23, F21.H1;\n", + j2c12 => "--:-:-:-:1 \@P0 $convert_in F22, F21.H0;\n", + j2c16 => "--:-:-:-:1 \@P0 $convert_in F21, F20.H1;\n", + j2c20 => "--:-:4:-:1 \@P0 $convert_in F20, F20.H0;\n", + + j2c26 => "10:-:-:-:1 \@P0 $convert_in F33, F31.H1;\n", + j2c30 => "--:-:-:-:1 \@P0 $convert_in F32, F31.H0;\n", + j2c34 => "--:-:-:-:1 \@P0 $convert_in F31, F30.H1;\n", + j2c38 => "--:6:5:-:1 \@P0 $convert_in F30, F30.H0;\n", + ) : (), + + j1c22 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<512*4 + 00*4>], F0;\n", + j1c24 => "02:-:2:-:1 \@P1 LDG.E.CG.$vsize F0, [track0 + 4x<00 * $dsize>];\n", + + j1c40 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<512*4 + 32*4>], F1;\n", + j1c42 => "04:-:3:-:1 \@P1 LDG.E.CG.$vsize F1, [track0 + 4x<32 * $dsize>];\n", + + j2c22 => "08:4:-:-:1 \@P0 STS.128 [writeS + 4x<512*4 + 64*4>], F2;\n", + j2c24 => "08:-:4:-:1 \@P1 LDG.E.CG.$vsize F2, [track0 + 4x<64 * $dsize>];\n", + + j2c40 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<512*4 + 96*4>], F3;\n", + j2c42 => "10:6:5:-:1 \@P1 LDG.E.CG.$vsize F3, [track0 + 4x<96 * $dsize>];\n", + + j3c57 => "20:-:-:-:1 \@P1 IADD track0.CC, track0, 4x<32*16 * $dsize>;\n", + j3c62 => "--:-:-:-:1 \@P1 IADD.X track1, track1, RZ;\n", + + ) : ( + $convert_in ? 
( + j0c5 => "02:-:-:-:1 $convert_in f00, f00;\n", + j0c7 => "--:-:-:-:1 $convert_in f01, f01;\n", + j0c9 => "--:-:-:-:1 $convert_in f02, f02;\n", + + j0c11 => "--:-:-:-:1 $convert_in f20, f20;\n", + j0c13 => "--:-:-:-:1 $convert_in f21, f21;\n", + j0c15 => "--:-:2:-:1 $convert_in f22, f22;\n", + + j0c17 => "--:-:-:-:1 $convert_in f10, f10;\n", + j0c19 => "--:-:-:-:1 $convert_in f11, f11;\n", + j0c21 => "--:-:4:-:1 $convert_in f12, f12;\n", + ) : (), + + j0c33 => "02:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 0)>], F00;\n", + j0c35 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 3)>], F03;\n", + j0c37 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 0)>], F30;\n", + j0c39 => "--:3:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 3)>], F33;\n", + + j0c40 => "--:-:-:-:1 \@P0 FADD tb0, TF00, TF02;\n" . + "--:-:-:-:1 \@P0 FADD tb3, TF30, TF32;\n" . + "--:-:-:-:1 \@P0 FADD ta0, f00, f20;\n" . + "--:-:-:-:1 \@P0 FADD ta1, f01, f21;\n" . + "--:-:-:-:1 \@P0 FADD ta2, f02, f22;\n", + + j0c41 => "--:-:-:-:1 \@P0 FMUL tb0, tb0, 0.5;\n" . + "--:-:-:-:1 \@P0 FMUL tb3, tb3, 0.5;\n" . + "--:-:-:-:1 \@P0 FMUL ta0, ta0, 0.5;\n" . + "--:-:-:-:1 \@P0 FMUL ta1, ta1, 0.5;\n" . + "--:-:-:-:1 \@P0 FMUL ta2, ta2, 0.5;\n", + + j0c42 => "--:-:-:-:1 \@P0 FFMA F01, TF01, 0.5, tb0;\n" . + "--:-:-:-:1 \@P0 FFMA F02, TF01, -0.5, tb0;\n" . + "--:-:-:-:1 \@P0 FFMA F31, TF31, 0.5, tb3;\n" . 
+ "--:-:-:-:1 \@P0 FFMA F32, TF31, -0.5, tb3;\n", + + j0c45 => "04:-:-:-:1 \@P1 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n", + j0c47 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n", + j0c49 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n", + + j0c51 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n", + j0c53 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n", + j0c55 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];\n", + + j1c8 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 1)>], F01;\n", + j1c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 2)>], F02;\n", + j1c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 1)>], F31;\n", + j1c14 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 2)>], F32;\n", + + j1c15 => "08:-:-:-:1 \@P0 FFMA TF10, f10, 0.5, ta0;\n" . + "--:-:-:-:1 \@P0 FFMA TF20, f10, -0.5, ta0;\n" . + "--:-:-:-:1 \@P0 FFMA TF11, f11, 0.5, ta1;\n" . + "--:-:-:-:1 \@P0 FFMA TF21, f11, -0.5, ta1;\n" . + "--:-:-:-:1 \@P0 FFMA TF12, f12, 0.5, ta2;\n" . + "--:-:-:-:1 \@P0 FFMA TF22, f12, -0.5, ta2;\n", + + j1c16 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n", + j1c18 => "--:-:-:-:1 \@P1 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n", + j1c20 => "--:6:2:-:1 \@P1 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n", + + j1c22 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 0)>], F10;\n", + j1c24 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 0)>], F20;\n", + j1c26 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 3)>], F13;\n", + j1c28 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 3)>], F23;\n", + + j1c29 => "--:-:-:-:1 \@P0 FADD tb1, TF10, TF12;\n" . + "--:-:-:-:1 \@P0 FADD tb2, TF20, TF22;\n", + + j1c34 => "--:-:-:-:1 \@P0 FMUL tb1, tb1, 0.5;\n" . 
+ "--:-:-:-:1 \@P0 FMUL tb2, tb2, 0.5;\n", + + j1c39 => "--:-:-:-:1 \@P0 FFMA F11, TF11, 0.5, tb1;\n" . + "--:-:-:-:1 \@P0 FFMA F12, TF11, -0.5, tb1;\n" . + "--:-:-:-:1 \@P0 FFMA F21, TF21, 0.5, tb2;\n" . + "--:-:-:-:1 \@P0 FFMA F22, TF21, -0.5, tb2;\n", + + j2c8 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 1)>], F11;\n", + j2c10 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 2)>], F12;\n", + j2c12 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 1)>], F21;\n", + j2c14 => "--:-:-:-:1 \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 2)>], F22;\n", + + + j3c57 => "20:-:-:-:1 \@P1 IADD track0.CC, track0, param_4RSKp;\n", + j3c62 => "--:-:-:-:1 \@P1 IADD.X track1, track1, RZ;\n", + ), + + j2c62 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<512*4*2>;\n", + + j3c63 => "--:-:-:Y:5 \@P0 BRA.U FILTER_LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 3) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 4; + my $rsPred = $j == 3 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + # Stall 0 (dual-issue) only when the FIRST inserted instruction is a memory/convert/barrier/branch op. + # Anchoring to the first line keeps later lines of a multi-line insert from zeroing the FFMA stall (same rule as IMAGE_LOOP). + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +END_LOOP: +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R idx_N, SR_CTAID.Z; +--:-:3:-:1 S2R idx_K, SR_CTAID.Y; + + +01:-:-:-:1 LOP.AND tid_31, tid, 31; + +--:-:-:-:1 BFE idx_n, idx_nkpq, 0x041c; +--:-:-:-:1 BFE idx_k, idx_nkpq, 0x0418; +--:-:-:-:1 BFE idx_P, idx_nkpq, 0x0c0c; +--:-:-:-:1 BFE idx_Q, idx_nkpq, 0x0c00; + +02:-:-:-:1 XMAD idx_N, idx_N, param_n, idx_n; +04:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +[+ + our $bsum; return $bsum ? q{ +--:-:-:-:1 XMAD bsum_offset, idx_Q, param_gridN, idx_N; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_P, param_gridQN, bsum_offset; + } : ''; ++] + +// x = grid_x << shiftX +// y = grid_y << shiftY +--:-:-:-:1 SHL p, idx_P, param_shiftP; +--:-:-:-:1 SHL q, idx_Q, param_shiftQ; + +// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp +--:-:-:-:1 BFE.U32 superP, tid, param_superP; +--:-:-:-:1 BFE.U32 superQ, tid, param_superQ; +--:-:-:-:1 ISCADD p, superP, p, 1; +--:-:-:-:1 ISCADD q, superQ, q, 1; + + +--:-:-:-:1 LOP.AND superN, tid, param_superN; +--:-:-:-:1 SHL n, idx_N, param_shiftN; +--:-:-:-:1 IADD n, n, superN; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV32I one, 1.0; + +// readFs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid_1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 LOP.OR readFs, readFs, tid_1; +//--:-:-:-:1 SHL readFs, readFs, 3; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2) +--:-:-:-:1 LOP.AND tid_16, tid, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; +--:-:-:-:1 BFE.U32
readIs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid_16; +--:-:-:-:1 ISCADD readIs, readFs, readIs, 2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// writeCs = readFs * 512 + readIs; +--:-:-:-:1 ISCADD writeCs, readFs, readIs, 12; + +// readCs = tid32 * 512 + tid_31 + tid_64 * 16 +--:-:-:-:1 SHR.U32 tid32, tid, 5; +--:-:-:-:1 SHR.U32 tid64, tid, 6; +--:-:-:-:1 ISCADD readCs, tid32, tid_31, 9; +--:-:-:-:1 ISCADD readCs, tid64, readCs, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// k = idx_K*32 + tid32<<1 +--:-:-:-:1 SHL tid32, tid32, 1; +--:-:-:-:1 ISCADD k, idx_K, tid32, 5; + +// Out00 = k*PQN + p*QN + q*N + n +// Out01 = Out00 + N +// Out10 = Out00 + QN +// Out11 = Out01 + QN +--:-:-:-:1 XMAD out_offset, q, 1x<$N>, n; +--:-:-:-:1 XMAD.LO2C out_offset, p, param_QN, out_offset; +--:-:-:-:1 XMAD.LO2C out_offset, k, param_PQN, out_offset; + + +--:-:-:-:1 MOV PQN15, param_PQN; +--:-:-:-:1 SHL PQN15, PQN15, 4; +--:-:-:-:1 IADD PQN15, PQN15, -param_PQN; + +--:-:-:-:1 IADD q2, q, 1; +--:-:-:-:1 IADD p2, p, 1; + + +--:-:-:-:1 ISETP.EQ.AND P6, PT, RZ, param_flags, PT; // ! 
no-op +--:-:-:-:1 ISETP.LT.AND P6, PT, n, 1x<$N>, P6; // n < N +--:-:-:-:1 ISETP.LT.AND P2, PT, p, param_P, PT; // p0 < P && n < N +--:-:-:-:1 ISETP.LT.AND P3, PT, q, 1x<$Q>, PT; // q0 < Q && n < N +--:-:-:-:1 ISETP.LT.AND P4, PT, p2, param_P, PT; // p1 < P && n < N +--:-:-:-:1 ISETP.LT.AND P5, PT, q2, 1x<$Q>, PT; // q1 < Q && n < N + +--:-:-:-:1 PSETP.AND.AND P0, PT, P2, P3, P6; // p0 && q0 +--:-:-:-:1 PSETP.AND.AND P1, PT, P2, P5, P6; // p0 && q1 +--:-:-:-:1 PSETP.AND.AND P2, PT, P4, P3, P6; // p1 && q0 +--:-:-:-:1 PSETP.AND.AND P3, PT, P4, P5, P6; // p1 && q1 +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +--:-:-:-:1 ISETP.EQ.AND P6, PT, tid_31, RZ, PT; // tid31 == 0 + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, cx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, cx7y2, alpha; + +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 1; +--:-:-:-:0 IADD out_offset, out_offset, param_PQN; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y1, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y1, alpha; +--:-:-:-:1 
FMUL shuffle_x4y0, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 15; +--:-:-:-:0 IADD out_offset, out_offset, PQN15; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y4, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y6, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 1; +--:-:-:-:0 IADD 
out_offset, out_offset, param_PQN; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + + + +11:-:-:-:1 ISETP.LT.AND P4, PT, k, 1x<$K>, PT; // k < K +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; +[+ + our ($beta, $brelu, $bprelu, $dsize, $dshift, $dtype, $Q, $N); + return $beta || $brelu || $bprelu ? qq{ +--:-:-:-:1 LEA Out0.CC, out_offset, param_X[0], $dshift; +--:-:-:-:1 LEA.HI.X Out1, out_offset, param_X[1], RZ, $dshift; + +--:-:-:-:1 \@!P0 MOV b00, RZ; +--:-:-:-:1 \@!P1 MOV b01, RZ; +--:-:-:-:1 \@!P2 MOV b10, RZ; +--:-:-:-:1 \@!P3 MOV b11, RZ; + +--:-:-:-:1 \@P0 LDG.E.CI.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>]; +--:-:5:-:1 \@P1 LDG.E.CI.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CI.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>]; +--:-:6:-:1 \@P3 LDG.E.CI.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>]; + + } : ''; ++] +[+ + our $bias; return $bias ? 
q{ +// sum = S + k +20:-:-:-:1 LEA Sum0.CC, k, param_S[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, k, param_S[1], RZ, 2; + +--:-:-:-:1 @!P4 MOV b00, RZ; +--:-:5:-:1 @P4 LDG.E.CI b00, [Sum]; + } : ''; ++] + + +--:-:-:-:1 LDS m00, [readCs + 4x< 0*32>]; +--:-:-:-:1 LDS m01, [readCs + 4x< 1*32>]; +--:-:-:-:1 LDS m02, [readCs + 4x< 2*32>]; +--:-:1:Y:1 LDS m03, [readCs + 4x< 3*32>]; + +--:-:-:-:1 LDS m10, [readCs + 4x< 4*32>]; +--:-:-:-:1 LDS m11, [readCs + 4x< 5*32>]; +--:-:-:-:1 LDS m12, [readCs + 4x< 6*32>]; +--:-:2:Y:1 LDS m13, [readCs + 4x< 7*32>]; + +--:-:-:-:1 LDS m20, [readCs + 4x< 8*32>]; +--:-:-:-:1 LDS m21, [readCs + 4x< 9*32>]; +--:-:-:-:1 LDS m22, [readCs + 4x<10*32>]; +--:-:3:Y:1 LDS m23, [readCs + 4x<11*32>]; + +--:-:-:-:1 LDS m30, [readCs + 4x<12*32>]; +--:-:-:-:1 LDS m31, [readCs + 4x<13*32>]; +--:-:-:-:1 LDS m32, [readCs + 4x<14*32>]; +--:-:4:Y:1 LDS m33, [readCs + 4x<15*32>]; + + +// t00 = m00+m01+m02; +// t01 = m01-m02-m03; +01:-:-:-:1 FADD t00, m00, m01; +--:-:-:-:1 FADD t00, t00, m02; +--:-:-:-:1 FADD t01, m01, -m02; +--:-:-:-:1 FADD t01, t01, -m03; +// t10 = m10+m11+m12; +// t11 = m11-m12-m13; +02:-:-:-:1 FADD t10, m10, m11; +--:-:-:-:1 FADD t10, t10, m12; +--:-:-:-:1 FADD t11, m11, -m12; +--:-:-:-:1 FADD t11, t11, -m13; +// t20 = m20+m21+m22; +// t21 = m21-m22-m23; +04:-:-:-:1 FADD t20, m20, m21; +--:-:-:-:1 FADD t20, t20, m22; +--:-:-:-:1 FADD t21, m21, -m22; +--:-:-:-:1 FADD t21, t21, -m23; +// t30 = m30+m31+m32; +// t31 = m31-m32-m33; +08:-:-:-:1 FADD t30, m30, m31; +--:-:-:-:1 FADD t30, t30, m32; +--:-:-:-:1 FADD t31, m31, -m32; +--:-:-:-:1 FADD t31, t31, -m33; +// y00 = t00+t10+t20; +// y01 = t01+t11+t21; +--:-:-:-:1 FADD s00, t00, t10; +--:-:-:-:1 FADD s00, s00, t20; +--:-:-:-:1 FADD s01, t01, t11; +--:-:-:-:1 FADD s01, s01, t21; +// y10 = t10-t20-t30; +// y11 = t11-t21-t31; +--:-:-:-:1 FADD s10, t10, -t20; +--:-:-:-:1 FADD s10, s10, -t30; +--:-:-:-:1 FADD s11, t11, -t21; +--:-:-:-:3 FADD s11, s11, -t31; + +[+ + our $bias; return $bias ? 
q{ +10:-:-:-:1 @P0 FADD s00, s00, b00; +--:-:-:-:1 @P1 FADD s01, s01, b00; +--:-:-:-:1 @P2 FADD s10, s10, b00; +--:-:-:-:1 @P3 FADD s11, s11, b00; + } : ''; ++] +[+ + our $relu; return $relu ? q{ +// maximum(x, 0) +--:-:-:-:1 @P0 FMNMX s00, s00, RZ, !PT; +--:-:-:-:1 @P1 FMNMX s01, s01, RZ, !PT; +--:-:-:-:1 @P2 FMNMX s10, s10, RZ, !PT; +--:-:-:-:1 @P3 FMNMX s11, s11, RZ, !PT; + } : ''; ++] +[+ + our $prelu; return $prelu ? q{ +// maximum(x, 0) + slope * minimum(0, x) +--:-:-:-:1 @P0 FMNMX b00, s00, RZ, !PT; +--:-:-:-:1 @P1 FMNMX b01, s01, RZ, !PT; +--:-:-:-:1 @P2 FMNMX b10, s10, RZ, !PT; +--:-:-:-:1 @P3 FMNMX b11, s11, RZ, !PT; + +--:-:-:-:1 @P0 FMNMX x00, s00, RZ, PT; +--:-:-:-:1 @P1 FMNMX x01, s01, RZ, PT; +--:-:-:-:1 @P2 FMNMX x10, s10, RZ, PT; +--:-:-:-:1 @P3 FMNMX x11, s11, RZ, PT; + +--:-:-:-:1 @P0 FFMA s00, x00, param_beta, b00; +--:-:-:-:1 @P1 FFMA s01, x01, param_beta, b01; +--:-:-:-:1 @P2 FFMA s10, x10, param_beta, b10; +--:-:-:-:1 @P3 FFMA s11, x11, param_beta, b11; + } : ''; ++] + + + +[+ + our ($beta, $brelu, $bprelu, $convert_in); + return $convert_in && ($beta || $brelu || $bprelu) ? qq{ +10:-:1:-:1 \@P0 $convert_in b00, b00; +--:-:2:-:1 \@P1 $convert_in b01, b01; +20:-:3:-:1 \@P2 $convert_in b10, b10; +--:-:4:-:1 \@P3 $convert_in b11, b11; + } : ''; ++] +[+ + our $beta; return $beta ? q{ +11:-:-:-:1 @P0 FFMA s00, b00, param_beta, s00; +02:-:-:-:1 @P1 FFMA s01, b01, param_beta, s01; +24:-:-:-:1 @P2 FFMA s10, b10, param_beta, s10; +08:-:-:-:1 @P3 FFMA s11, b11, param_beta, s11; + } : ''; ++] +[+ + our $brelu; return $brelu ?
q{ +//delta *= x > 0 +11:-:-:-:1 FSETP.GT.AND P0, PT, b00, RZ, PT; +02:-:-:-:1 FSETP.GT.AND P1, PT, b01, RZ, PT; +24:-:-:-:1 FSETP.GT.AND P2, PT, b10, RZ, PT; +08:-:-:-:1 FSETP.GT.AND P3, PT, b11, RZ, PT; +--:-:-:-:1 @!P0 MOV s00, RZ; +--:-:-:-:1 @!P1 MOV s01, RZ; +--:-:-:-:1 @!P2 MOV s10, RZ; +--:-:-:-:1 @!P3 MOV s11, RZ; +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:5 @!P4 R2P PR, RZ, 0x0f; + } : ''; ++] +[+ + our $bprelu; return $bprelu ? q{ +//delta *= ((x > 0) + slope * (x < 0)) +11:-:-:-:1 FSETP.GT.AND P0, PT, b00, RZ, PT; +02:-:-:-:1 FSETP.GT.AND P1, PT, b01, RZ, PT; +24:-:-:-:1 FSETP.GT.AND P2, PT, b10, RZ, PT; +08:-:-:-:1 FSETP.GT.AND P3, PT, b11, RZ, PT; +--:-:-:-:1 SEL x00, one, RZ, P0; +--:-:-:-:1 SEL x01, one, RZ, P1; +--:-:-:-:1 SEL x10, one, RZ, P2; +--:-:-:-:1 SEL x11, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b00, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b01, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b10, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b11, RZ, PT; +--:-:-:-:1 SEL b00, one, RZ, P0; +--:-:-:-:1 SEL b01, one, RZ, P1; +--:-:-:-:1 SEL b10, one, RZ, P2; +--:-:-:-:1 SEL b11, one, RZ, P3; +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; +--:-:-:-:1 FFMA b00, b00, param_beta, x00; +--:-:-:-:1 FFMA b01, b01, param_beta, x01; +--:-:-:-:1 FFMA b10, b10, param_beta, x10; +--:-:-:-:1 FFMA b11, b11, param_beta, x11; +--:-:-:-:1 FMUL s00, s00, b00; +--:-:-:-:1 FMUL s01, s01, b01; +--:-:-:-:1 FMUL s10, s10, b10; +--:-:-:-:1 FMUL s11, s11, b11; + } : ''; ++] +[+ + our $bsum; return $bsum ? q{ +--:-:-:-:1 MOV sum0, RZ; +--:-:-:-:1 @P0 FADD sum0, s00, sum0; +--:-:-:-:1 @P1 FADD sum0, s01, sum0; +--:-:-:-:1 @P2 FADD sum0, s10, sum0; +--:-:-:-:1 @P3 FADD sum0, s11, sum0; + } : ''; ++] + + +[+ + our $convert_out; + return $convert_out ? 
qq{ +--:-:1:-:1 $convert_out s00, s00; +--:-:2:-:1 $convert_out s01, s01; +--:-:3:-:1 $convert_out s10, s10; +--:-:4:-:1 $convert_out s11, s11; + } : ''; ++] + + + +--:-:-:-:1 LEA Out0.CC, out_offset, param_O[0], [+ dshift() +]; +--:-:-:-:1 LEA.HI.X Out1, out_offset, param_O[1], RZ, [+ dshift() +]; + +// k < K && R2P && output +01:-:-:-:1 @P0 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 0*$N>], s00; +02:-:-:-:1 @P1 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 1*$N>], s01; +04:-:-:-:1 @P2 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 0*$N>], s10; +08:1:-:-:1 @P3 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 1*$N>], s11; + + +[+ + our $bsum; + return $bsum ? q{ + +--:-:-:-:1 XMAD.LO2C b00, k, param_gridPQN, bsum_offset; + +--:-:-:-:1 LEA Sum0.CC, b00, param_S[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, b00, param_S[1], RZ, 2; + +--:-:-:-:1 PSETP.AND.AND P5, PT, P4, P6, PT; // k < K && tid31 == 0 + +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 1, 0x1f; +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 2, 0x1f; +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 4, 0x1f; +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 8, 0x1f; +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 16, 0x1f; +10:-:-:-:2 FADD sum0, sum1, sum0; + +--:5:-:-:1 @P5 STG.E.CG [Sum], sum0; + + } : ''; ++] + +--:-:-:-:5 RET; + + + +// T0 = np.empty((4,4)) +// T1 = np.empty((4,4)) +// +// for O, I in ((T0, I), (T1, T0.T)): +// +// O[0,:] = I[0,:] - I[2,:] +// O[1,:] = I[1,:] + I[2,:] +// O[2,:] = I[2,:] - I[1,:] +// O[3,:] = I[1,:] - I[3,:] +// +// Iw[:] = T1.T +// +// 0 = i00 +// 1 = i01 +// 2 = i02 +// 3 = i03 +// 4 = i30 +// 5 = i31 +// 6 = i32 +// 7 = i33 +// 8 = i13 +// 9 = i12 +// 10 = i11 +// 11 = i10 +// 12 = i23, TI23, I23 +// 13 = i22, TI22 +// 14 = i21, TI21 +// 15 = i20, TI20, I20 +// 16 = TI00, I00, TI10, I10, I21, I01 +// 17 = TI01, I11 +// 18 = TI02, I12 +// 19 = TI03, I03, 
TI11, I31 +// 20 = TI30, I30, TI12, I32 +// 21 = TI31 +// 22 = TI32 +// 23 = TI33, I33, TI13, I13, I22, I02 +// +// +// TI00 = i00 - i20 +// TI01 = i01 - i21 +// TI02 = i02 - i22 +// TI03 = i03 - i23 +// # load 0 +// +// TI30 = i10 - i30 +// TI31 = i11 - i31 +// TI32 = i12 - i32 +// TI33 = i13 - i33 +// # load 3 +// +// I00 = TI00 - TI02 +// I03 = TI01 - TI03 +// I30 = TI30 - TI32 +// I33 = TI31 - TI33 +// # store 0 +// +// # wait 0 +// TI10 = i10 + i20 +// TI11 = i11 + i21 +// TI12 = i12 + i22 +// TI13 = i13 + i23 +// +// TI20 = i20 - i10 +// TI21 = i21 - i11 +// TI22 = i22 - i12 +// TI23 = i23 - i13 +// +// #load 1 +// +// I10 = TI10 - TI12 +// I20 = TI20 - TI22 +// I13 = TI11 - TI13 +// I23 = TI21 - TI23 +// # store 1 +// +// # wait 1 +// I21 = TI21 + TI22 +// I22 = TI22 - TI21 +// # store 2 +// +// # load 2 +// +// # wait 2 +// I01 = TI01 + TI02 +// I02 = TI02 - TI01 +// I11 = TI11 + TI12 +// I12 = TI12 - TI11 +// I31 = TI31 + TI32 +// I32 = TI32 - TI31 +// #store 3 + + + +// T0 = np.empty((4,3)) +// T1 = np.empty((4,4)) +// +// for O, I in ((T0, F), (T1, T0.T)): +// +// t0 = (I[0,:] + I[2,:])*0.5 +// +// O[0,:] = I[0,:] +// O[1,:] = t0 + I[1,:]*0.5 +// O[2,:] = t0 - I[1,:]*0.5 +// O[3,:] = I[2,:] +// +// Fw[:] = T1.T +// +// 0 = f00, TF00, F00 +// 1 = f01, TF01 +// 2 = f02, TF02, F03 +// 3 = f10 +// 4 = f11 +// 5 = f12 +// 6 = f20, TF30, F30 +// 7 = f21, TF31 +// 8 = f22, TF32, F33 +// 9 = tb3, F32 +// 10 = tb0, F02 +// 11 = ta2, TF22, F23 +// 12 = ta0, TF20, F20 +// 13 = ta1, TF21 +// 14 = F01 +// 15 = F31 +// 16 = TF10, F10 +// 17 = TF11 +// 18 = TF12, F13 +// 19 = tb1, F12 +// 20 = tb2, F22 +// 21 = F11 +// 22 = F21 +// 23 = +// +// +// TF00 = f00 +// TF01 = f01 +// TF02 = f02 +// TF30 = f20 +// TF31 = f21 +// TF32 = f22 +// +// F00 = TF00 +// F03 = TF02 +// F30 = TF30 +// F33 = TF32 +// +// # store 0 +// +// tb0 = TF00 + TF02 +// tb3 = TF30 + TF32 +// ta0 = f00 + f20 +// ta1 = f01 + f21 +// ta2 = f02 + f22 +// +// tb0 = tb0 * 0.5 +// tb3 = tb3 * 0.5 +// 
ta0 = ta0 * 0.5 +// ta1 = ta1 * 0.5 +// ta2 = ta2 * 0.5 +// +// F01 = tb0 + TF01*0.5 +// F02 = tb0 - TF01*0.5 +// F31 = tb3 + TF31*0.5 +// F32 = tb3 - TF31*0.5 +// +// # wait 0 +// # load 0, 2 +// # store 1 +// +// TF10 = ta0 + f10*0.5 +// TF20 = ta0 - f10*0.5 +// TF11 = ta1 + f11*0.5 +// TF21 = ta1 - f11*0.5 +// TF12 = ta2 + f12*0.5 +// TF22 = ta2 - f12*0.5 +// +// # load 1 +// +// F10 = TF10 +// F20 = TF20 +// F13 = TF12 +// F23 = TF22 +// +// # store 2 +// +// tb1 = TF10 + TF12 +// tb2 = TF20 + TF22 +// tb1 = tb1 * 0.5 +// tb2 = tb2 * 0.5 +// +// F11 = tb1 + TF11*0.5 +// F12 = tb1 - TF11*0.5 +// F21 = tb2 + TF21*0.5 +// F22 = tb2 - TF21*0.5 +// +// # store 3// \ No newline at end of file diff --git a/Kernel/Convolution/Pascal/xconv_winograd_2x2_5x5_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_2x2_5x5_32x32.sass new file mode 100644 index 0000000..0fcb767 --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_winograd_2x2_5x5_32x32.sass @@ -0,0 +1,1589 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $type; +our $dtype = $type eq 'h' ? 'U16' : '32'; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +our $dshift = $type eq 'h' ? '1' : '2'; +our $dsize = $type eq 'h' ? '2' : '4'; +our $vsize = $type eq 'h' ? 
'64' : '128'; +sub dtype { return $dtype; } +sub dsize { return $dsize; } +sub dshift { return $dshift; } +sub vsize { return $vsize; } +-] + + + + addr_zero : 4x<32*36*2*4 + 64 + 0> + addr_idx_Y : 4x<32*36*2*4 + 64 + 4> + addr_idx_X : 4x<32*36*2*4 + 64 + 5> + addr_idx_K : 4x<32*36*2*4 + 64 + 6> + + param_O[0] : c[0x0][0x140] + param_O[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_F[0] : c[0x0][0x150] + param_F[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_flags : c[0x0][0x15c] + param_C : c[0x0][0x160] + param_K : c[0x0][0x164] + param_N : c[0x0][0x168] + param_H : c[0x0][0x16c] + param_W : c[0x0][0x170] + param_HWN : c[0x0][0x174] + param_WN : c[0x0][0x178] + param_Y2 : c[0x0][0x17c] + param_GX : c[0x0][0x180] + param_Xk : c[0x0][0x184] + param_k : c[0x0][0x188] + param_magic_Xk : c[0x0][0x18c] + param_shift_Xk : c[0x0][0x190] + param_magic_k : c[0x0][0x194] + param_shift_k : c[0x0][0x198] + param_P : c[0x0][0x19c] + param_Q : c[0x0][0x1a0] + param_QN : c[0x0][0x1a4] + param_PQN : c[0x0][0x1a8] + param_PQNp : c[0x0][0x1ac] + param_PQN15p : c[0x0][0x1b0] + param_shiftY : c[0x0][0x1b4] + param_shiftX : c[0x0][0x1b8] + param_shiftN : c[0x0][0x1bc] + param_superY : c[0x0][0x1c0] + param_superX : c[0x0][0x1c4] + param_superN : c[0x0][0x1c8] + param_SuperY : c[0x0][0x1cc] + param_SuperX : c[0x0][0x1d0] + param_SuperN : c[0x0][0x1d4] + param_pad_x : c[0x0][0x1d8] + param_pad_y : c[0x0][0x1dc] + param_HWN2p : c[0x0][0x1e0] + param_C_1152 : c[0x0][0x1e4] + + + + + 0-63 : czero<00-63> + + // Image Transform + 52 = i00, TI00, I00 + 53 = i10, TI50, I50 + 54 = i01, TI01, I05 + 55 = i11, TI51, I55 + 56 = TI10, I10 + 57 = TI20, I20 + 58 = TI30, I30 + 59 = TI40, I40 + 60 = TI41, I45 + 61 = TI31, I35 + 62 = TI21, I25 + 63 = TI11, I15 + 64-67 : I0<1-4> + 68-71 : I5<1-4> + 72-75 : I1<1-4> + 76-79 : I2<1-4> + 80-83 : I3<1-4> + 84-87 : I4<1-4> + + // Filter Transform + 52-87 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>, F4<0-3>, F5<0-3>, 
F6<0-3>, F7<0-3>, F8<0-3> + + // Load Loop Registers + 3, 2,11,10 : clx<0-3>y0 + 7, 6,15,14 : clx<0-3>y1 + 1, 0, 9, 8 : clx<0-3>y2 + 5, 4,13,12 : clx<0-3>y3 + 19,18,27,26 : clx<0-3>y4 + 23,22,31,30 : clx<0-3>y5 + 17,16,25,24 : clx<0-3>y6 + 21,20,29,28 : clx<0-3>y7 + + 32-43 : jl0Ix<0-3>, jl0Fy<0-7> + 44-51 : jl1Ix<0-3>, jl1Fy<4-7> + 36-39 : jl1Fy<0-3> + + 32-51 ~ partialC, c, idx_K, idx_Y, idx_X, idx_N, tid31, gx, gy, offset, nn, x1, x2, y1, mask_x + 52-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, idx_Y2, idx_X2, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, super_x, super_y + 87 = tid + + // Compute Loop Registers + 3, 2,11,10,19,18,27,26 : ccx<0-7>y0 + 7, 6,15,14,23,22,31,30 : ccx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2 + 5, 4,13,12,21,20,29,28 : ccx<0-7>y3 + 35,34,43,42,51,50,59,58 : ccx<0-7>y4 + 39,38,47,46,55,54,63,62 : ccx<0-7>y5 + 33,32,41,40,49,48,57,56 : ccx<0-7>y6 + 37,36,45,44,53,52,61,60 : ccx<0-7>y7 + + 64-79 : jc0Ix<0-7>, jc0Fy<0-7> + 80-91 : jc1Ix<4-7>, jc1Fy<0-7> + 64-67 : jc1Ix<0-3> + + 64-86 ~ tid16, tid_1, tid128 + + // Shared Registers + 88-89 : track<0-1> + 92-95 ~ C, swapBuf, readFs, readIs + 90-91 ~ writeS, preds + + // Load Loop Finish + 32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1 + 48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16 + + // Compute Loop Finish + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + 64-87 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, four, z<1-5>, mask_q, offsetO, sign + 90-95 ~ writeCs, readCs, k, pred30, pred36, tid31_4 + 88-89 : Out<0-1> + + 3, 2,11,10,19,18 : m<0-5>0 + 27, 1,26, 0, 9, 8 : m<0-5>1 + 16,17,24,25,64,65 : m<0-5>2 + 66,67,68,69,70,71 : m<0-5>3 + 72,73,74,75,76,77 : m<0-5>4 + 78,79,80,81,82,83 : m<0-5>5 + + 3, 2,11,10,19,18 : w<0-5>0 + 27, 1,26, 0, 9, 8 : w<0-5>1 + 16,17,24,25,64,65 : w<0-5>2 + 66,67,68,69,70,71 : w<0-5>3 + 72,73,74,75,76,77 : w<0-5>4 + 78,79,80,81,82,83 : w<0-5>5 + + 3, 2,11,10,19,18 : s<0-5>0 + 27, 1,26, 0, 9, 8 : 
s<0-5>1 + 16,17,24,25,64,65 : s<0-5>2 + 66,67,68,69,70,71 : s<0-5>3 + 72,73,74,75,76,77 : s<0-5>4 + 78,79,80,81,82,83 : s<0-5>5 + + 85,84,86,87 : t<0-3>0 + 85,87,84,86 : t<0-3>1 + 85,84,87,86 : t<0-3>2 + 85,84,87,86 : t<0-3>3 + 85,84,87,86 : t<0-3>4 + 85,84,87,86 : t<0-3>5 + 85,84,87,86 : r0<0-3> + 85,84,87,86 : r1<0-3> + 85,87,86,84 : r2<0-3> + 84,85,86,87 : r3<0-3> + 85,84,87,86 : r4<0-3> + 84,85,87,86 : r5<0-3> + + + +--:-:-:-:0 MOV C, param_C; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:-:-:1 MOV swapBuf, 4x<32*36*2*2>; +01:-:-:-:0 ISETP.GE.AND P0, PT, tid, 128, PT; +--:-:-:-:1 STS.128 [addr_zero], RZ; +--:-:-:Y:c LOP.AND partialC, C, 1; +--:-:-:-:0 IADD C, C, partialC; +--:-:-:-:5 @P0 BRA.U COMPUTE_SETUP; + +############################################################## +LOAD_SETUP: + +--:-:1:-:1 S2R idx_YXk, SR_CTAID.X; +--:-:2:-:1 S2R idx_K, SR_CTAID.Y; +--:-:3:-:1 S2R idx_N, SR_CTAID.Z; + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +] + +--:-:-:-:1 ISETP.EQ.AND P0, PT, tid, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, tid, 64, PT; + +// idx_Y2 = idx_YXk / blk_Xk +--:-:-:-:1 MOV magic_Xk, param_magic_Xk; +--:-:-:-:1 IADD negXk, RZ, -param_Xk; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_Xk, 1, PT; +01:-:-:-:1 @P3 XMAD div1, idx_YXk, magic_Xk, RZ; +--:-:-:-:1 @P3 XMAD div2, idx_YXk, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD div3, idx_YXk.H1, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk, div1; +--:-:-:-:1 @P3 IADD3.RS idx_Y2, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 idx_Y2, idx_Y2, param_shift_Xk; +--:-:-:-:1 @!P3 SHR.U32 idx_Y2, idx_YXk, param_shift_Xk; + +// idx_Xk = idx_YXk % blk_Xk +--:-:-:-:1 XMAD.LO2 idx_Xk, negXk, idx_Y2, idx_YXk; + +// idx_X2 = idx_Xk / blk_k +// idx_k = idx_Xk % blk_k +--:-:-:-:1 XMAD idx_X2, idx_Xk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_X2, idx_X2, param_shift_k; +--:-:-:-:1 XMAD idx_k, idx_X2, param_k, RZ; +--:-:-:-:1 IADD idx_k, -idx_k, idx_Xk; + +// idx_K = idx_K * blk_k + 
idx_k +02:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +//--:-:-:-:1 MOV idx_X, idx_X2; +//--:-:-:-:1 MOV idx_Y, idx_Y2; + +// gx = x2 +// gy = y2 * 2 +--:-:-:-:1 MOV idx_X, idx_X2; +--:-:-:-:1 SHL idx_Y, idx_Y2, 1; + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// if y2 != Y2: +// gy += (gx&1) ^ ((gx&2)>>1) +// gx /= 2 +--:-:-:-:1 ISETP.NE.AND P4, PT, idx_Y2, param_Y2, PT; +--:-:-:-:1 @P4 LOP.AND x1, idx_X, 1; +--:-:-:-:1 @P4 BFE.U32 x2, idx_X, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P4 LOP.XOR x1, x1, x2; +--:-:-:-:1 @P4 IADD idx_Y, idx_Y, x1; +--:-:-:-:1 @P4 SHR.U32 idx_X, idx_X, 1; + +// Scan backwards on odd rows +// if y2 & 1: +// gx = gridX - gx - 1 +--:-:-:-:1 LOP.AND.NZ P5, RZ, idx_Y2, 1; +--:-:-:-:1 @P5 IADD idx_X, -idx_X, param_GX; +--:-:-:-:1 @P5 IADD idx_X, idx_X, -1; + +--:-:-:-:1 @P0 STS [addr_idx_Y], idx_Y; +--:-:-:-:1 @P0 STS [addr_idx_X], idx_X; +--:-:-:-:1 @P0 STS [addr_idx_K], idx_K; + +// x = gx << shiftX +// y = gy << shiftY +--:-:-:-:1 SHL gx, idx_X, param_shiftX; +--:-:-:-:1 SHL gy, idx_Y, param_shiftY; + +// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp +--:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 super_y, tid, param_superY; +--:-:-:-:1 ISCADD gx, super_x, gx, 1; +--:-:-:-:1 ISCADD gy, super_y, gy, 1; + +--:-:-:-:1 LOP.AND tid32_2, tid, -32; +--:-:-:-:1 SHR.U32 tid32_2, tid32_2, 2; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid32_2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD3 readFs, readFs, tid1, tid32_2; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*36*2>, 4; + +// c = (tid & 32) >> 5 +--:-:-:-:1 BFE.U32 c, tid, 0x105; // 1 bits at 
position 5 + +// P6 = c == partialC == 1 +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, partialC, P6; + +--:-:-:-:1 LOP.AND tid31, tid, 31; + + +04:-:-:-:5 @P1 BRA.U FILTER_SETUP; + +############################################################## +IMAGE_SETUP: + + + +// writeS = c*32*36 + tid31 +--:-:-:-:1 XMAD writeS, c, 1152, tid31; +--:-:-:-:1 SHL writeS, writeS, 2; + +--:-:-:-:1 STS [writeS + 4x<32*0>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*1>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*2>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*3>], RZ; + +// n = idx_N< +--:-:-:-:1 @!P0 MOV i00, RZ; +--:-:2:-:1 @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>]; +--:-:-:-:1 @!P2 MOV i10, RZ; +--:-:3:-:1 @P2 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>]; +--:-:-:-:1 @!P1 MOV i01, RZ; +--:-:4:-:1 @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>]; +--:-:-:-:1 @!P3 MOV i11, RZ; +--:6:5:-:1 @P3 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>]; + + + +--:-:-:-:5 BAR.SYNC 0; + +20:-:-:-:0 IADD track0.CC, track0, -partialC; + +--:-:-:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:1:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; + +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:2 IADD swapBuf, RZ, -swapBuf; +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:5 BRA.U IMAGE_LOOP; + +############################################################## +FILTER_SETUP: + + + +// writeS = (c*32*36 + (tid & 31)*4 + 32*36*2)*4 +--:-:-:-:1 ISCADD writeS, tid31, 4x<32*36*2>, 4; +--:-:-:-:1 XMAD writeS, c, 4x<32*36>, writeS; + +--:-:-:-:1 STS.128 [writeS], RZ; + +// offset = c*32*36 + tid31*4 +--:-:-:-:1 SHL tid31, tid31, 2; +--:-:-:-:1 XMAD offset, c, 1x<32*36>, tid31; + +// (kBlks,C,6,6,32) +// offset += (idx_K*C*32*36) * itemsize; +--:-:-:-:1 XMAD.LO2C offset, idx_K, param_C_1152, offset; +--:-:-:-:1 LEA track0.CC, 
offset, param_F[0], [+ dshift() +]; +--:-:-:-:1 LEA.HI.X track1, offset, param_F[1], RZ, [+ dshift() +]; + +--:-:-:-:1 XMAD partialC, partialC, 1x<32*36 * $dsize>, RZ; + +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F0, [track + 4x<0*32 * $dsize>]; +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F1, [track + 4x<1*32 * $dsize>]; +--:-:2:-:1 @!P6 LDG.E.[+ vsize() +] F2, [track + 4x<2*32 * $dsize>]; + +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F0, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F1, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F2, [addr_zero]; + +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F3, [track + 4x<3*32 * $dsize>]; +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F4, [track + 4x<4*32 * $dsize>]; +--:-:3:-:1 @!P6 LDG.E.[+ vsize() +] F5, [track + 4x<5*32 * $dsize>]; + +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F3, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F4, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F5, [addr_zero]; + +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F6, [track + 4x<6*32 * $dsize>]; +--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F7, [track + 4x<7*32 * $dsize>]; +--:6:4:-:1 @!P6 LDG.E.[+ vsize() +] F8, [track + 4x<8*32 * $dsize>]; + +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F6, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F7, [addr_zero]; +--:-:-:-:1 @P6 LDS.U.[+ vsize() +] F8, [addr_zero]; + + +--:-:-:-:5 BAR.SYNC 0; + +20:-:-:-:0 IADD track0.CC, track0, -partialC; + +--:-:-:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:1:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; + +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:2 IADD swapBuf, RZ, -swapBuf; +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:5 BRA.U FILTER_LOOP; + +############################################################## + +COMPUTE_SETUP: + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +--:-:-:-:1 IADD tid128, tid, -128; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readFs = ((tid & 
-16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid128, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readIs, tid128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid16; +--:-:-:-:1 ISCADD readIs, readIs, 4x<32*4 + 32*36*2*2>, 4; + +--:-:-:-:1 LOP.AND tid_1, tid128, 1; +--:-:-:-:1 LOP.AND readFs, tid128, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 IADD3 readFs, readFs, tid16, tid_1; +--:-:-:-:0 ISCADD readFs, readFs, 4x<32*4 + 32*36*2*3>, 4; + + +--:-:-:-:5 BAR.SYNC 0; + +// Let Load loop run once to transform initial load and store to shared. +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>]; +--:-:1:-:2 LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>]; + +COMPUTE_LOOP: +[+ + my %insert = ( + + j0c33 => "--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT;\n" . + "--:-:-:-:1 IADD C, C, -2;\n", + + j0c62 => "02:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j1c63 => "--:-:-:Y:5 \@P0 BRA.U COMPUTE_LOOP;\n" . + "--:-:-:Y:5 BRA.U COMPUTE_FINISH;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 1) + { + my $odd = $j; + my $nOdd = 1 - $j; + my $rsPred = $j == 1 ? '@P0' : ' '; + my $bar = $j == 0 ? 
'2' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd; + + $insert{"j${j}c31"} = sprintf "--:%s:1:-:1 %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd; + + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + #$stall = '4' if $stall && $c % 2 == 0 && $j == 0 && $c > 16; + + my $yield = ($c % 5 == 0) && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +IMAGE_LOOP: +--:-:-:-:1 ISETP.GT.AND P6, PT, C, 2, PT; +[+ + our ($dtype, $dsize, $convert_in, $W, $N); + my %insert = ( + + j0c0 => "--:-:-:-:1 ISETP.GT.AND P5, PT, C, RZ, PT;\n" . + "--:-:-:-:1 IADD C, C, -2;\n", + + $convert_in ? ( + j0c1 => "02:-:2:-:1 F2F.F32.F16 i00, i00;\n", + j0c2 => "04:-:3:-:1 F2F.F32.F16 i10, i10;\n", + j0c3 => "08:-:4:-:1 F2F.F32.F16 i01, i01;\n", + j0c4 => "10:-:5:-:1 F2F.F32.F16 i11, i11;\n", + ) : (), + + j0c5 => "02:-:-:-:1 STS [writeS + 4x<32*(0*6 + 0)>], I00;\n", + j0c6 => "04:-:-:-:1 STS [writeS + 4x<32*(5*6 + 0)>], I50;\n", + + j0c7 => "--:-:-:-:1 FFMA TI10, i10, 0.75, i00;\n" . + "--:-:-:-:1 FFMA TI20, i10, -0.75, i00;\n" . + "--:-:-:-:1 FFMA TI30, i10, 1.50, i00;\n" . + "--:-:-:-:1 FFMA TI40, i10, -1.50, i00;\n" . + "--:-:-:-:1 IADD track0.CC, track0, param_HWN2p;\n" . 
+ "--:-:-:-:1 @!P6 MOV preds, RZ;\n", + + j0c8 => "08:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 5)>], I05;\n", + j0c9 => "10:6:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 5)>], I55;\n", + + j0c10 => "--:-:-:-:0 FFMA TI11, i11, 0.75, i01;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 0)>], I10;\n" . + "--:-:-:-:0 FFMA TI21, i11, -0.75, i01;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 0)>], I20;\n" . + "--:-:-:-:0 FFMA TI31, i11, 1.50, i01;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 0)>], I30;\n" . + "--:-:-:-:0 FFMA TI41, i11, -1.50, i01;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 0)>], I40;\n" . + "--:-:-:-:1 R2P PR, preds, 0xf;\n" . + "--:-:-:-:1 IADD.X track1, track1, RZ;\n", + + j0c11 => "--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n", + j0c13 => "--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n", + j0c19 => "--:-:1:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + + j0c14 => "--:-:-:-:0 FFMA I01, TI01, 0.75, TI00;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 5)>], I15;\n" . + "--:-:-:-:0 FFMA I02, TI01, -0.75, TI00;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 5)>], I25;\n" . + "--:-:-:-:0 FFMA I03, TI01, 1.50, TI00;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 5)>], I35;\n" . + "--:-:-:-:0 FFMA I04, TI01, -1.50, TI00;\n" . + "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 5)>], I45;\n", + + j0c15 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 1)>], I01;\n", + j0c16 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 2)>], I02;\n", + j0c17 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 3)>], I03;\n", + j0c18 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(0*6 + 4)>], I04;\n", + + j0c20 => "--:-:-:-:1 FFMA I51, TI51, 0.75, TI50;\n" . + "--:-:-:-:1 FFMA I52, TI51, -0.75, TI50;\n" . + "--:-:-:-:1 FFMA I53, TI51, 1.50, TI50;\n" . 
+ "--:-:-:-:1 FFMA I54, TI51, -1.50, TI50;\n", + + j0c21 => "20:-:2:-:1 \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n", + j0c22 => "--:-:3:-:1 \@P2 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n", + j0c23 => "--:-:4:-:1 \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n", + j0c24 => "--:-:5:-:1 \@P3 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n", + + j0c25 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 1)>], I51;\n", + j0c26 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 2)>], I52;\n", + j0c27 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 3)>], I53;\n", + j0c28 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(5*6 + 4)>], I54;\n", + + j0c29 => "--:-:-:-:1 FFMA I11, TI11, 0.75, TI10;\n" . + "--:-:-:-:1 FFMA I12, TI11, -0.75, TI10;\n" . + "--:-:-:-:1 FFMA I13, TI11, 1.50, TI10;\n" . + "--:-:-:-:1 FFMA I14, TI11, -1.50, TI10;\n", + + j0c30 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 1)>], I11;\n", + j0c31 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 2)>], I12;\n", + j1c0 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 3)>], I13;\n", + j1c1 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(1*6 + 4)>], I14;\n", + + j1c2 => "--:-:-:-:1 FFMA I21, TI21, 0.75, TI20;\n" . + "--:-:-:-:1 FFMA I22, TI21, -0.75, TI20;\n" . + "--:-:-:-:1 FFMA I23, TI21, 1.50, TI20;\n" . + "--:-:-:-:1 FFMA I24, TI21, -1.50, TI20;\n", + + j1c3 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 1)>], I21;\n", + j1c4 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 2)>], I22;\n", + j1c5 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 3)>], I23;\n", + j1c6 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(2*6 + 4)>], I24;\n", + + j1c7 => "--:-:-:-:1 FFMA I31, TI31, 0.75, TI30;\n" . + "--:-:-:-:1 FFMA I32, TI31, -0.75, TI30;\n" . + "--:-:-:-:1 FFMA I33, TI31, 1.50, TI30;\n" . 
+ "--:-:-:-:1 FFMA I34, TI31, -1.50, TI30;\n", + + j1c8 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 1)>], I31;\n", + j1c9 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 2)>], I32;\n", + j1c10 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 3)>], I33;\n", + j1c11 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(3*6 + 4)>], I34;\n", + + j1c12 => "--:-:-:-:1 FFMA I41, TI41, 0.75, TI40;\n" . + "--:-:-:-:1 FFMA I42, TI41, -0.75, TI40;\n" . + "--:-:-:-:1 FFMA I43, TI41, 1.50, TI40;\n" . + "--:-:-:-:1 FFMA I44, TI41, -1.50, TI40;\n", + + j1c13 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 1)>], I41;\n", + j1c14 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 2)>], I42;\n", + j1c15 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 3)>], I43;\n", + j1c16 => "--:-:-:-:1 \@P5 STS [writeS + 4x<32*(4*6 + 4)>], I44;\n", + + j1c17 => "--:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P5 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P5 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P5 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P5 IADD swapBuf, RZ, -swapBuf;\n", + + j1c18 => "--:-:-:-:1 \@P5 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n", + j1c20 => "--:-:-:-:1 \@P5 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n", + j1c22 => "--:-:1:-:1 \@P5 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n", + + j1c31 => "--:-:-:Y:5 \@P5 BRA.U IMAGE_LOOP;\n" . + "--:-:-:Y:5 BRA.U LOAD_FINISH;", + + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "01" : '--'; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 
0 : 1; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + + +FILTER_LOOP: +--:-:-:-:1 ISETP.GT.AND P0, PT, C, RZ, PT; +20:-:-:-:1 IADD track0.CC, track0, 1x<32*36*2 * $dsize>; +--:-:-:-:1 ISETP.GT.AND P1, PT, C, 2, PT; +--:-:-:-:1 IADD C, C, -2; +[+ + our ($vsize, $dsize, $convert_in); + my %insert = ( + + j0c3 => "--:-:-:-:1 IADD.X track1, track1, RZ;\n", + + j0c0 => "--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n", + j0c2 => "--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n", + j0c18 => "--:-:1:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + + j1c12 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n", + j1c14 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n", + j1c16 => "--:-:1:-:1 \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n", + + $convert_in ? ( + + j0c1 => "02:-:-:-:1 F2F.F32.F16 F03, F01.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F02, F01.H0;\n", + j0c4 => "--:-:-:-:1 F2F.F32.F16 F01, F00.H1;\n" . + "--:-:2:-:1 F2F.F32.F16 F00, F00.H0;\n", + + j0c5 => "--:-:-:-:1 F2F.F32.F16 F13, F11.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F12, F11.H0;\n", + j0c6 => "--:-:-:-:1 F2F.F32.F16 F11, F10.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 F10, F10.H0;\n", + + j0c7 => "--:-:-:-:1 F2F.F32.F16 F23, F21.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F22, F21.H0;\n", + j0c8 => "--:-:-:-:1 F2F.F32.F16 F21, F20.H1;\n" . 
+ "--:-:6:-:1 F2F.F32.F16 F20, F20.H0;\n", + + j0c9 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*32*4>], F0;\n", + j0c10 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], F1;\n", + j0c11 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], F2;\n", + + j0c13 => "02:-:-:-:1 \@P1 LDG.E.$vsize F0, [track + 4x<0*32 * $dsize>];\n", + j0c14 => "10:-:-:-:1 \@P1 LDG.E.$vsize F1, [track + 4x<1*32 * $dsize>];\n", + j0c15 => "20:-:2:-:1 \@P1 LDG.E.$vsize F2, [track + 4x<2*32 * $dsize>];\n", + + j0c16 => "04:-:-:-:1 F2F.F32.F16 F33, F31.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F32, F31.H0;\n", + j0c17 => "--:-:-:-:1 F2F.F32.F16 F31, F30.H1;\n" . + "--:-:3:-:1 F2F.F32.F16 F30, F30.H0;\n", + + j0c19 => "--:-:-:-:1 F2F.F32.F16 F43, F41.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F42, F41.H0;\n", + j0c20 => "--:-:-:-:1 F2F.F32.F16 F41, F40.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 F40, F40.H0;\n", + + j0c21 => "--:-:-:-:1 F2F.F32.F16 F53, F51.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F52, F51.H0;\n", + j0c22 => "--:-:-:-:1 F2F.F32.F16 F51, F50.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 F50, F50.H0;\n", + + j0c23 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], F3;\n", + j0c24 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], F4;\n", + j0c25 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], F5;\n", + + j0c27 => "04:-:-:-:1 \@P1 LDG.E.$vsize F3, [track + 4x<3*32 * $dsize>];\n", + j0c28 => "10:-:-:-:1 \@P1 LDG.E.$vsize F4, [track + 4x<4*32 * $dsize>];\n", + j0c29 => "20:-:3:-:1 \@P1 LDG.E.$vsize F5, [track + 4x<5*32 * $dsize>];\n", + + j0c30 => "08:-:-:-:1 F2F.F32.F16 F63, F61.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F62, F61.H0;\n", + j0c31 => "--:-:-:-:1 F2F.F32.F16 F61, F60.H1;\n" . + "--:-:4:-:1 F2F.F32.F16 F60, F60.H0;\n", + + j1c0 => "--:-:-:-:1 F2F.F32.F16 F73, F71.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 F72, F71.H0;\n", + j1c1 => "--:-:-:-:1 F2F.F32.F16 F71, F70.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 F70, F70.H0;\n", + + j1c2 => "--:-:-:-:1 F2F.F32.F16 F83, F81.H1;\n" . 
+ "--:-:-:-:1 F2F.F32.F16 F82, F81.H0;\n", + j1c3 => "--:-:-:-:1 F2F.F32.F16 F81, F80.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 F80, F80.H0;\n", + + j1c4 => "08:4:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], F6;\n", + j1c5 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], F7;\n", + j1c6 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], F8;\n", + + j1c8 => "08:-:-:-:1 \@P1 LDG.E.$vsize F6, [track + 4x<6*32 * $dsize>];\n", + j1c9 => "10:-:-:-:1 \@P1 LDG.E.$vsize F7, [track + 4x<7*32 * $dsize>];\n", + j1c10 => "20:6:4:-:1 \@P1 LDG.E.$vsize F8, [track + 4x<8*32 * $dsize>];\n", + + ) : ( + + j0c6 => "02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], F0;\n", + j0c8 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], F1;\n", + j0c10 => "--:2:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], F2;\n", + + j0c12 => "02:-:-:-:1 \@P1 LDG.E.$vsize F0, [track + 4x<0*32 * $dsize>];\n", + j0c14 => "--:-:-:-:1 \@P1 LDG.E.$vsize F1, [track + 4x<1*32 * $dsize>];\n", + j0c16 => "--:-:2:-:1 \@P1 LDG.E.$vsize F2, [track + 4x<2*32 * $dsize>];\n", + + j0c20 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], F3;\n", + j0c22 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], F4;\n", + j0c24 => "--:3:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], F5;\n", + + j0c26 => "04:-:-:-:1 \@P1 LDG.E.$vsize F3, [track + 4x<3*32 * $dsize>];\n", + j0c28 => "--:-:-:-:1 \@P1 LDG.E.$vsize F4, [track + 4x<4*32 * $dsize>];\n", + j0c30 => "--:-:3:-:1 \@P1 LDG.E.$vsize F5, [track + 4x<5*32 * $dsize>];\n", + + j1c0 => "08:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], F6;\n", + j1c2 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], F7;\n", + j1c4 => "--:4:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], F8;\n", + + j1c6 => "08:-:-:-:1 \@P1 LDG.E.$vsize F6, [track + 4x<6*32 * $dsize>];\n", + j1c8 => "--:-:-:-:1 \@P1 LDG.E.$vsize F7, [track + 4x<7*32 * $dsize>];\n", + j1c10 => "--:6:4:-:1 \@P1 LDG.E.$vsize F8, [track + 4x<8*32 * $dsize>];\n", + ), + + j1c11 => "--:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j1c31 => "--:-:-:Y:5 \@P0 BRA.U FILTER_LOOP;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "01" : '--'; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + +LOAD_FINISH: + +//--:-:-:-:5 EXIT; + + +--:-:1:-:2 S2R Tid, SR_TID.X; + +--:-:-:-:1 MOV alpha16, param_alpha; + +01:-:-:-:1 LOP.AND Tid32_2, Tid, -32; +--:-:-:-:1 SHR.U32 Tid32_2, Tid32_2, 2; + +// readFs = ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND Tid1, Tid, 1; +01:-:-:-:1 LOP.AND readFs, Tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD readFs, readFs, Tid1; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2) +--:-:-:-:1 BFE.U32 readIs, Tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, Tid32_2; +--:-:-:-:1 ISCADD readIs, readFs, readIs, 2; + +--:-:-:-:1 SHL readIs, readIs, 4; +--:-:-:-:1 SHL readFs, readFs, 3; + +// writeCs = readFs * 32*36 + readIs; +--:-:-:-:1 XMAD write16Cs, readFs, 1x<32*36>, readIs; + + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y2, 
alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y2, alpha16; +--:-:-:-:4 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y3, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y3, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y6, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y6, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y7, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y7, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 EXIT; + + 
+COMPUTE_FINISH: + +//--:-:-:-:5 EXIT; + + +--:-:1:-:2 S2R tid_128, SR_TID.X; + + +01:-:-:-:1 IADD tid_128, tid_128, -128; + +--:-:-:-:1 ISETP.GE.AND P6, PT, tid_128, 256, PT; + +// readFs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND Tid_1, tid_128, 1; +--:-:-:-:1 LOP.AND readFs2, tid_128, 8; +--:-:-:-:1 SHR.U32 readFs2, readFs2, 2; +--:-:-:-:1 IADD readFs2, readFs2, Tid_1; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2) +--:-:-:-:1 LOP.AND tid_16, tid_128, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; +--:-:-:-:1 BFE.U32 readIs2, tid_128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs2, readIs2, tid_16; +--:-:-:-:1 ISCADD readIs2, readFs2, readIs2, 2; + +--:-:-:-:1 ISCADD readIs2, readIs2, 4x<32*4>, 4; +--:-:-:-:1 SHL readFs2, readFs2, 3; + +// writeCs = readFs * 32*36 + readIs; +--:-:-:-:0 XMAD writeCs, readFs2, 1x<32*36>, readIs2; + + +--:-:-:-:5 @P6 BRA.U SKIP0; + +--:-:2:-:1 LDS idxX, [addr_idx_X]; +--:-:3:-:1 LDS idxY, [addr_idx_Y]; +--:-:1:-:1 S2R idxN, SR_CTAID.Z; +--:-:4:-:1 LDS idxK, [addr_idx_K]; + + +--:-:-:-:1 LOP.AND tid_31, tid_128, 31; +--:-:-:-:1 SHR.U32 tid_32, tid_128, 5; +--:-:-:-:1 SHR.U32 tid_64, tid_128, 6; + + +// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16 +--:-:-:-:1 XMAD readCs, tid_32, 1x<32*36>, tid_31; +--:-:-:-:1 ISCADD readCs, tid_64, readCs, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// Superblock offset +// idxX <<= shiftX +// idxY <<= shiftY +04:-:-:-:1 SHL idxY, idxY, param_shiftY; +02:-:-:-:1 SHL idxX, idxX, param_shiftX; +01:-:-:-:1 SHL idxN, idxN, param_shiftN; + +// Get this thread's offset within the superblock +--:-:-:-:1 BFE.U32 p, tid_31, param_SuperY; +--:-:-:-:1 BFE.U32 q, tid_31, param_SuperX; +--:-:-:-:1 LOP.AND n, tid_31, param_SuperN; + +--:-:-:-:1 ISCADD q, q, idxX, 1; +--:-:-:-:1 ISCADD p, p, idxY, 1; + +--:-:-:-:1 MOV four, -4; +--:-:-:-:1 IADD3 q, q, param_pad_x, four; +--:-:-:-:1 IADD3 p, p, param_pad_y, four; + +[+ + our ($type, $N); + if ($type eq 'h') + { + return q{ 
+--:-:-:-:1 SHL tid31_4, tid_31, 2; + +--:-:-:-:1 ISCADD n, n, idxN, 1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tid_31, 16, PT; + } + } + else { + return q{ +--:-:-:-:1 IADD n, n, idxN; +--:-:-:-:1 ISETP.LT.AND P6, PT, n, param_N, PT; + }; + } ++] + +// k = idxK*32 + tid_32<<1 +--:-:-:-:1 SHL tid_32, tid_32, 1; +08:-:-:-:1 ISCADD k, idxK, tid_32, 5; + +// Out = k*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD.S16.U16 offsetO, q, param_N, n; +--:-:-:-:1 XMAD.S16.U16.LO2C offsetO, p, param_QN, offsetO; +--:-:-:-:1 XMAD.U16.U16.LO2C offsetO, k, param_PQN, offsetO; +--:-:-:-:1 ISET.LT.AND sign, offsetO, RZ, PT; + +--:-:-:-:1 LEA Out0.CC, offsetO, param_O[0], [+ dshift() +]; +--:-:-:-:1 IADD.X Out1, sign, param_O[1]; + +--:-:-:-:1 ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! no-op + +--:-:-:-:1 IADD z1, q, 1; +--:-:-:-:1 IADD z2, q, 2; +--:-:-:-:1 IADD z3, q, 3; +--:-:-:-:1 IADD z4, q, 4; +--:-:-:-:1 IADD z5, q, 5; +--:-:-:-:1 ISETP.LT.AND P0, PT, q, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, z4, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P5, PT, z5, param_Q, P5; +--:-:-:-:1 ISETP.GE.AND P0, PT, q, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, z4, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, z5, RZ, P5; +--:-:-:-:1 P2R mask_q, PR, RZ, 0x3f; + +--:-:-:-:1 IADD z1, p, 1; +--:-:-:-:1 IADD z2, p, 2; +--:-:-:-:1 IADD z3, p, 3; +--:-:-:-:1 IADD z4, p, 4; +--:-:-:-:1 IADD z5, p, 5; +--:-:-:-:1 ISETP.LT.AND P0, PT, p, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P4, PT, z4, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, z5, param_P, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, p, RZ, P0; 
+--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, z4, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, z5, RZ, P5; + +--:-:-:-:1 SEL pred30, mask_q, RZ, P0; +--:-:-:-:1 @P1 BFI pred30, mask_q, 0x606, pred30; +--:-:-:-:1 @P2 BFI pred30, mask_q, 0x60c, pred30; +--:-:-:-:1 @P3 BFI pred30, mask_q, 0x612, pred30; +--:-:-:-:1 @P4 BFI pred30, mask_q, 0x618, pred30; +--:-:-:-:1 SEL pred36, mask_q, RZ, P5; + +--:-:-:-:1 ISETP.GE.AND P6, PT, tid_128, 256, PT; + + + +SKIP0: + + +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x7y0, ccx7y0, param_alpha; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y1, ccx3y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y2, param_alpha; +--:-:-:-:1 FMUL shuffle_x7y1, ccx7y2, param_alpha; + +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P6 BRA.U SKIP1; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP1: + +--:-:-:-:0 IADD k, k, 1; +--:-:-:-:5 BAR.SYNC 0; +01:-:-:-:1 IADD Out0.CC, Out0, param_PQNp; +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y0, 
ccx2y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y1, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y1, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y1, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y3, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y3, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y3, param_alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y3, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y3, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y3, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y3, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y3, param_alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:0 IADD.X Out1, Out1, RZ; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P6 BRA.U SKIP2; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP2: + +--:-:-:-:0 IADD k, k, 15; +--:-:-:-:5 BAR.SYNC 0; +01:-:-:-:1 IADD Out0.CC, Out0, param_PQN15p; +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y4, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y4, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y4, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y6, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y6, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y6, param_alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y6, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y6, param_alpha; +--:-:-:-:1 FMUL 
shuffle_x5y1, ccx5y6, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y6, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y6, param_alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:0 IADD.X Out1, Out1, RZ; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P6 BRA.U SKIP3; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP3: + +--:-:-:-:0 IADD k, k, 1; +--:-:-:-:5 BAR.SYNC 0; +01:-:-:-:1 IADD Out0.CC, Out0, param_PQNp; +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y5, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y5, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y5, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y7, param_alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y7, param_alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y7, param_alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y7, param_alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y7, param_alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y7, param_alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y7, param_alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y7, param_alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:0 IADD.X Out1, Out1, RZ; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P6 BRA.U SKIP4; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP4: + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + + +01:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, PT; +--:-:-:-:1 @!P0 MOV pred30, RZ; +--:-:-:-:1 @!P0 MOV pred36, RZ; +[+ + my $out; + foreach my $i (0 .. 2) + { + foreach my $j (0 .. 
5) + { + my $b = $i + 1; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + + + +// t0 = I[1,:] + I[2,:] +// t1 = I[1,:] - I[2,:] +// t2 = I[3,:] + I[4,:] +// t3 = I[3,:] - I[4,:] +// O[2,:] = t0 * -2.25 + t2 * -0.5625 + I[0,:] * -2.8125 +// O[1,:] = t1 * -1.6875 + t3 * -0.84375 + I[5,:] * 1.265625 +// O[3,:] = t1 * 0.75 + t3 * 1.5 + I[5,:] * -2.8125 +// O[4,:] = I[0,:] + t0 + t2 +// O[0,:] = I[0,:] * 1.265625 +// O[5,:] = I[5,:] +[+ + my $out; + foreach my $i (0 .. 2) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ +$w:-:-:-:1 FADD t0$i, m1$i, m2$i; +--:-:-:-:1 FADD t1$i, m1$i, -m2$i; +--:-:-:-:1 FADD t2$i, m3$i, m4$i; +--:-:-:-:1 FADD t3$i, m3$i, -m4$i; +--:-:-:-:1 FMUL w2$i, m0$i, -2.8125; +--:-:-:-:1 FFMA w2$i, t0$i, -2.25, w2$i; +--:-:-:-:1 FFMA w2$i, t2$i, -0.5625, w2$i; +--:-:-:-:1 FMUL w1$i, m5$i, 1.265625; +--:-:-:-:1 FFMA w1$i, t1$i, -1.6875, w1$i; +--:-:-:-:1 FFMA w1$i, t3$i, -0.84375, w1$i; +--:-:-:-:1 FMUL w3$i, m5$i, -2.8125; +--:-:-:-:1 FFMA w3$i, t1$i, 0.75, w3$i; +--:-:-:-:1 FFMA w3$i, t3$i, 1.5, w3$i; +--:-:-:-:1 FADD w4$i, m0$i, t0$i; +--:-:-:-:1 FADD w4$i, w4$i, t2$i; +--:-:-:-:1 FMUL w0$i, m0$i, 1.265625; + }; + } + foreach my $i (3 .. 5) + { + foreach my $j (0 .. 5) + { + my $b = $i + 1; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + + + +[+ + my $out; + foreach my $i (3 .. 
5) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ +$w:-:-:-:1 FADD t0$i, m1$i, m2$i; +--:-:-:-:1 FADD t1$i, m1$i, -m2$i; +--:-:-:-:1 FADD t2$i, m3$i, m4$i; +--:-:-:-:1 FADD t3$i, m3$i, -m4$i; +--:-:-:-:1 FMUL w2$i, m0$i, -2.8125; +--:-:-:-:1 FFMA w2$i, t0$i, -2.25, w2$i; +--:-:-:-:1 FFMA w2$i, t2$i, -0.5625, w2$i; +--:-:-:-:1 FMUL w1$i, m5$i, 1.265625; +--:-:-:-:1 FFMA w1$i, t1$i, -1.6875, w1$i; +--:-:-:-:1 FFMA w1$i, t3$i, -0.84375, w1$i; +--:-:-:-:1 FMUL w3$i, m5$i, -2.8125; +--:-:-:-:1 FFMA w3$i, t1$i, 0.75, w3$i; +--:-:-:-:1 FFMA w3$i, t3$i, 1.5, w3$i; +--:-:-:-:1 FADD w4$i, m0$i, t0$i; +--:-:-:-:1 FADD w4$i, w4$i, t2$i; +--:-:-:-:1 FMUL w0$i, m0$i, 1.265625; + }; + } + return $out; ++] +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; + + + +[+ + my $out; + foreach my $i (0 .. 5) + { + $out .= qq{ +--:-:-:-:1 FADD r${i}0, w${i}1, w${i}2; +--:-:-:-:1 FADD r${i}1, w${i}1, -w${i}2; +--:-:-:-:1 FADD r${i}2, w${i}3, w${i}4; +--:-:-:-:1 FADD r${i}3, w${i}3, -w${i}4; +--:-:-:-:1 FMUL s${i}2, w${i}0, -2.8125; +--:-:-:-:1 FFMA s${i}2, r${i}0, -2.25, s${i}2; +--:-:-:-:1 FFMA s${i}2, r${i}2, -0.5625, s${i}2; +--:-:-:-:1 FMUL s${i}1, w${i}5, 1.265625; +--:-:-:-:1 FFMA s${i}1, r${i}1, -1.6875, s${i}1; +--:-:-:-:1 FFMA s${i}1, r${i}3, -0.84375, s${i}1; +--:-:-:-:1 FMUL s${i}3, w${i}5, -2.8125; +--:-:-:-:1 FFMA s${i}3, r${i}1, 0.75, s${i}3; +--:-:-:-:1 FFMA s${i}3, r${i}3, 1.5, s${i}3; +--:-:-:-:1 FADD s${i}4, w${i}0, r${i}0; +--:-:-:-:1 FADD s${i}4, s${i}4, r${i}2; +--:-:-:-:1 FMUL s${i}0, w${i}0, 1.265625; + }; + } + return $out; ++] +[+ + our $type; + return $type eq 'h' ? 
q{ + +--:-:-:-:1 IADD readCs, readCs, -tid31_4; +--:-:-:-:1 SHR.U32 tid31_4, tid31_4, 1; +--:-:-:-:1 IADD readCs, readCs, tid31_4; + + +--:-:-:-:1 F2F.F16.F32 s05, s05; +--:-:-:-:1 F2F.F16.F32 s00, s00; +--:-:-:-:1 F2F.F16.F32 s02, s02; +--:-:-:-:1 F2F.F16.F32 s01, s01; +--:-:-:-:1 F2F.F16.F32 s03, s03; +--:-:1:-:1 F2F.F16.F32 s04, s04; + +--:-:-:-:1 F2F.F16.F32 s15, s15; +--:-:-:-:1 F2F.F16.F32 s10, s10; +--:-:-:-:1 F2F.F16.F32 s12, s12; +--:-:-:-:1 F2F.F16.F32 s11, s11; +--:-:-:-:1 F2F.F16.F32 s13, s13; +--:-:2:-:1 F2F.F16.F32 s14, s14; + +01:-:-:-:1 STS.U16 [readCs + 4x<(0*6+0)*32>], s00; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+1)*32>], s01; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+2)*32>], s02; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+3)*32>], s03; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+4)*32>], s04; +--:-:-:-:1 STS.U16 [readCs + 4x<(0*6+5)*32>], s05; + +--:-:-:-:1 F2F.F16.F32 s25, s25; +--:-:-:-:1 F2F.F16.F32 s20, s20; +--:-:-:-:1 F2F.F16.F32 s22, s22; +--:-:-:-:1 F2F.F16.F32 s21, s21; +--:-:-:-:1 F2F.F16.F32 s23, s23; +--:-:3:-:1 F2F.F16.F32 s24, s24; + +02:-:-:-:1 STS.U16 [readCs + 4x<(1*6+0)*32>], s10; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+1)*32>], s11; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+2)*32>], s12; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+3)*32>], s13; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+4)*32>], s14; +--:-:-:-:1 STS.U16 [readCs + 4x<(1*6+5)*32>], s15; + +--:-:-:-:1 F2F.F16.F32 s35, s35; +--:-:-:-:1 F2F.F16.F32 s30, s30; +--:-:-:-:1 F2F.F16.F32 s32, s32; +--:-:-:-:1 F2F.F16.F32 s31, s31; +--:-:-:-:1 F2F.F16.F32 s33, s33; +--:-:4:-:1 F2F.F16.F32 s34, s34; + +04:-:-:-:1 STS.U16 [readCs + 4x<(2*6+0)*32>], s20; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+1)*32>], s21; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+2)*32>], s22; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+3)*32>], s23; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+4)*32>], s24; +--:-:-:-:1 STS.U16 [readCs + 4x<(2*6+5)*32>], s25; + +--:-:-:-:1 F2F.F16.F32 s45, s45; +--:-:-:-:1 F2F.F16.F32 s40, s40; +--:-:-:-:1 F2F.F16.F32 s42, 
s42; +--:-:-:-:1 F2F.F16.F32 s41, s41; +--:-:-:-:1 F2F.F16.F32 s43, s43; +--:-:5:-:1 F2F.F16.F32 s44, s44; + +08:-:-:-:1 STS.U16 [readCs + 4x<(3*6+0)*32>], s30; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+1)*32>], s31; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+2)*32>], s32; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+3)*32>], s33; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+4)*32>], s34; +--:-:-:-:1 STS.U16 [readCs + 4x<(3*6+5)*32>], s35; + +--:-:-:-:1 F2F.F16.F32 s55, s55; +--:-:-:-:1 F2F.F16.F32 s50, s50; +--:-:-:-:1 F2F.F16.F32 s52, s52; +--:-:-:-:1 F2F.F16.F32 s51, s51; +--:-:-:-:1 F2F.F16.F32 s53, s53; +--:-:6:-:1 F2F.F16.F32 s54, s54; + +10:-:-:-:1 STS.U16 [readCs + 4x<(4*6+0)*32>], s40; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+1)*32>], s41; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+2)*32>], s42; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+3)*32>], s43; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+4)*32>], s44; +--:-:-:-:1 STS.U16 [readCs + 4x<(4*6+5)*32>], s45; + +20:-:-:-:1 STS.U16 [readCs + 4x<(5*6+0)*32>], s50; +--:-:-:-:1 STS.U16 [readCs + 4x<(5*6+1)*32>], s51; +--:-:-:-:1 STS.U16 [readCs + 4x<(5*6+2)*32>], s52; +--:-:-:-:1 STS.U16 [readCs + 4x<(5*6+3)*32>], s53; +--:-:-:-:1 STS.U16 [readCs + 4x<(5*6+4)*32>], s54; +--:1:-:-:2 STS.U16 [readCs + 4x<(5*6+5)*32>], s55; // FORCE + + +01:-:-:-:1 IADD readCs, readCs, -tid31_4; +--:-:-:-:1 SHL tid31_4, tid31_4, 1; +--:-:-:-:4 IADD readCs, readCs, tid31_4; + + } : q{ +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 0*$N>], s00; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 1*$N>], s01; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 2*$N>], s02; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 3*$N>], s03; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 4*$N>], s04; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 5*$N>], s05; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 0*$N>], s10; +--:-:-:-:1 @P1 
RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 1*$N>], s11; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 2*$N>], s12; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 3*$N>], s13; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 4*$N>], s14; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 5*$N>], s15; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 0*$N>], s20; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 1*$N>], s21; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 2*$N>], s22; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 3*$N>], s23; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 4*$N>], s24; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 5*$N>], s25; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 0*$N>], s30; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 1*$N>], s31; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 2*$N>], s32; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 3*$N>], s33; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 4*$N>], s34; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 5*$N>], s35; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.L.U64 pred30, pred30, 24, pred30; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 0*$N>], s40; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 1*$N>], s41; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 2*$N>], s42; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 3*$N>], s43; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 4*$N>], s44; +--:-:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 5*$N>], s45; +--:-:-:-:1 R2P PR, pred36, 0x3f; +--:-:-:-:1 @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 0*$N>], s50; +--:-:-:-:1 @P1 RED.E.ADD.F32.FTZ.RN [Out + 
4x<5*$Q*$N + 1*$N>], s51; +--:-:-:-:1 @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 2*$N>], s52; +--:-:-:-:1 @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 3*$N>], s53; +--:-:-:-:1 @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 4*$N>], s54; +--:1:-:-:1 @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 5*$N>], s55; + }; ++] + + +[+ + our $type; + return $type eq 'h' ? q{ +--:-:-:-:1 LDS.U.32 s00, [readCs + 4x<(0*6+0)*32>]; +--:-:-:-:1 LDS.U.32 s01, [readCs + 4x<(0*6+1)*32>]; +--:-:-:-:1 LDS.U.32 s02, [readCs + 4x<(0*6+2)*32>]; +--:-:-:-:1 LDS.U.32 s03, [readCs + 4x<(0*6+3)*32>]; +--:-:-:-:1 LDS.U.32 s04, [readCs + 4x<(0*6+4)*32>]; +--:-:1:-:1 LDS.U.32 s05, [readCs + 4x<(0*6+5)*32>]; + +--:-:-:-:1 LDS.U.32 s10, [readCs + 4x<(1*6+0)*32>]; +--:-:-:-:1 LDS.U.32 s11, [readCs + 4x<(1*6+1)*32>]; +--:-:-:-:1 LDS.U.32 s12, [readCs + 4x<(1*6+2)*32>]; +--:-:-:-:1 LDS.U.32 s13, [readCs + 4x<(1*6+3)*32>]; +--:-:-:-:1 LDS.U.32 s14, [readCs + 4x<(1*6+4)*32>]; +--:-:2:-:1 LDS.U.32 s15, [readCs + 4x<(1*6+5)*32>]; + +--:-:-:-:1 LDS.U.32 s20, [readCs + 4x<(2*6+0)*32>]; +--:-:-:-:1 LDS.U.32 s21, [readCs + 4x<(2*6+1)*32>]; +--:-:-:-:1 LDS.U.32 s22, [readCs + 4x<(2*6+2)*32>]; +--:-:-:-:1 LDS.U.32 s23, [readCs + 4x<(2*6+3)*32>]; +--:-:-:-:1 LDS.U.32 s24, [readCs + 4x<(2*6+4)*32>]; +--:-:3:-:1 LDS.U.32 s25, [readCs + 4x<(2*6+5)*32>]; + + + +01:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 0*$N>], s00; +--:-:-:-:1 LDS.U.32 s30, [readCs + 4x<(3*6+0)*32>]; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 1*$N>], s01; +--:-:-:-:1 LDS.U.32 s31, [readCs + 4x<(3*6+1)*32>]; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 2*$N>], s02; +--:-:-:-:1 LDS.U.32 s32, [readCs + 4x<(3*6+2)*32>]; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 3*$N>], s03; +--:-:-:-:1 LDS.U.32 s33, [readCs + 4x<(3*6+3)*32>]; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 4*$N>], s04; +--:-:-:-:1 LDS.U.32 s34, [readCs + 4x<(3*6+4)*32>]; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 
2x<0*$Q*$N + 5*$N>], s05; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:4:-:1 LDS.U.32 s35, [readCs + 4x<(3*6+5)*32>]; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 LDS.U.32 s40, [readCs + 4x<(4*6+0)*32>]; +02:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 0*$N>], s10; +--:-:-:-:1 LDS.U.32 s41, [readCs + 4x<(4*6+1)*32>]; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 1*$N>], s11; +--:-:-:-:1 LDS.U.32 s42, [readCs + 4x<(4*6+2)*32>]; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 2*$N>], s12; +--:-:-:-:1 LDS.U.32 s43, [readCs + 4x<(4*6+3)*32>]; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 3*$N>], s13; +--:-:-:-:1 LDS.U.32 s44, [readCs + 4x<(4*6+4)*32>]; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 4*$N>], s14; +--:-:5:-:1 LDS.U.32 s45, [readCs + 4x<(4*6+5)*32>]; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 5*$N>], s15; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 LDS.U.32 s50, [readCs + 4x<(5*6+0)*32>]; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 LDS.U.32 s51, [readCs + 4x<(5*6+1)*32>]; +04:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 0*$N>], s20; +--:-:-:-:1 LDS.U.32 s52, [readCs + 4x<(5*6+2)*32>]; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 1*$N>], s21; +--:-:-:-:1 LDS.U.32 s53, [readCs + 4x<(5*6+3)*32>]; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 2*$N>], s22; +--:-:-:-:1 LDS.U.32 s54, [readCs + 4x<(5*6+4)*32>]; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 3*$N>], s23; +--:-:6:-:1 LDS.U.32 s55, [readCs + 4x<(5*6+5)*32>]; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 4*$N>], s24; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 5*$N>], s25; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; + +08:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 0*$N>], s30; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 1*$N>], s31; +--:-:-:-:1 @P2 
RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 2*$N>], s32; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 3*$N>], s33; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 4*$N>], s34; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 5*$N>], s35; +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.L.U64 pred30, pred30, 24, pred30; +10:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 0*$N>], s40; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 1*$N>], s41; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 2*$N>], s42; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 3*$N>], s43; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 4*$N>], s44; +--:-:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 5*$N>], s45; +--:-:-:-:1 R2P PR, pred36, 0x3f; +20:-:-:-:1 @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 0*$N>], s50; +--:-:-:-:1 @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 1*$N>], s51; +--:-:-:-:1 @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 2*$N>], s52; +--:-:-:-:1 @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 3*$N>], s53; +--:-:-:-:1 @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 4*$N>], s54; +--:1:-:-:1 @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 5*$N>], s55; + + + } : ''; ++] + +--:-:-:-:5 RET; + +// RED.E.ADD.F16x2.FTZ.RN \ No newline at end of file diff --git a/Kernel/Convolution/Pascal/xconv_winograd_3x3_2x2_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_3x3_2x2_32x32.sass new file mode 100644 index 0000000..fe1dc07 --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_winograd_3x3_2x2_32x32.sass @@ -0,0 +1,1814 @@ + +# Copyright 2015 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our ($type, $IX, $D); +our $determ = $D; +our $dtype = $type eq 'h' ? '.U16' : ''; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +our $vec_size = $type eq 'h' ? '64' : '128'; +our $dtype_shift = $type eq 'h' ? '1' : '2'; +our $dtype_size = $type eq 'h' ? '2' : '4'; +sub dtype { return $dtype; } +sub dtype_shift { return $dtype_shift; } +sub output_op { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; } +-] + + + + addr_zero : 4x<(512*4 + 32)*4 + 0> + addr_blk_K : 4x<(512*4 + 32)*4 + 4> + addr_blk_C : 4x<(512*4 + 32)*4 + 5> + addr_blk_P : 4x<(512*4 + 32)*4 + 6> + addr_blk_Q : 4x<(512*4 + 32)*4 + 7> + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_Y : c[0x0][0x15c] + param_X : c[0x0][0x160] + param_P : c[0x0][0x164] + param_Q : c[0x0][0x168] + param_C : c[0x0][0x16c] + param_K : c[0x0][0x170] + param_N : c[0x0][0x174] + param_pad_y : c[0x0][0x178] + param_pad_x : c[0x0][0x17c] + param_GY : c[0x0][0x180] + param_GX : c[0x0][0x184] + param_GYS : c[0x0][0x188] + param_GXS : c[0x0][0x18c] + param_shiftYI : c[0x0][0x190] + param_shiftXI : c[0x0][0x194] + param_superYI : c[0x0][0x198] + param_superXI : c[0x0][0x19c] + param_superNI : c[0x0][0x1a0] + param_shiftY : c[0x0][0x1a4] + param_shiftX : c[0x0][0x1a8] + param_superY : c[0x0][0x1ac] + param_superX : c[0x0][0x1b0] + param_superN : c[0x0][0x1b4] + param_loopXI : c[0x0][0x1b8] 
+ param_loopX : c[0x0][0x1bc] + param_loopN : c[0x0][0x1c0] + param_strideY : c[0x0][0x1c4] + param_strideX : c[0x0][0x1c8] + param_XN : c[0x0][0x1cc] + param_YXN : c[0x0][0x1d0] + param_QN : c[0x0][0x1d4] + param_PQN : c[0x0][0x1d8] + param_SK : c[0x0][0x1dc] + param_RSK : c[0x0][0x1e0] + param_Np : c[0x0][0x1e4] + param_XNp : c[0x0][0x1e8] + param_2XNp : c[0x0][0x1ec] + param_QNp : c[0x0][0x1f0] + param_CPQkc : c[0x0][0x1f4] + param_PQkc : c[0x0][0x1f8] + param_Qkc : c[0x0][0x1fc] + param_kc : c[0x0][0x200] + param_c : c[0x0][0x204] + param_k : c[0x0][0x208] + param_magic_CPQkc : c[0x0][0x20c] + param_shift_CPQkc : c[0x0][0x210] + param_magic_PQkc : c[0x0][0x214] + param_shift_PQkc : c[0x0][0x218] + param_magic_Qkc : c[0x0][0x21c] + param_shift_Qkc : c[0x0][0x220] + param_magic_kc : c[0x0][0x224] + param_shift_kc : c[0x0][0x228] + param_magic_c : c[0x0][0x22c] + param_shift_c : c[0x0][0x230] + param_CRSK : c[0x0][0x234] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + + 64-79 : j0Ex<0-7>, j0Iy<0-7> + 80-95 : j1Ex<0-7>, j1Iy<0-7> + + 64-79 ~ blk_KCPQkc, blk_CPQkc, blk_PQkc, blk_Qkc, blk_kc, blk_k, blk_c, blk_K, blk_C, blk_P, magic_CPQkc, magic_PQkc, magic_Qkc + 84-95 ~ div1, div2, div3, tidX, tidY, tid16, tid1, neg_CPQkc, neg_PQkc, neg_Qkc, neg_kc, neg_c + + 80-82 : init, tid, blk_Q + 83 = blkC, blkK + 84-95 ~ x, x<1-3>, y, super_x, super_y, tid_X, c, offsign, mask_x, mask_y + 84-95 ~ nloop, N + 81 = off_sign + 64 = swapBuf + + 96-103 : track0<0-1>, track1<0-1>, track2<0-1>, track3<0-1> + + 120-127 ~ writeS, readEs, readIs, 
pred_bits, gys, gxs, n, offset + + 0-31 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3>, t0<0-3>, t1<0-3>, t2<0-3> + 64-72 : f0<0-2>, f1<0-2>, f2<0-2> + 76-79 : blkKCPQ<0-3> + 76-79 : K_blk, C_blk, P_blk, Q_blk + 84-95 ~ CRSK, xmad_determ, PQ_blk + 96-109 ~ alpha, writeCs, readCs, cc, RSK8, tid_1, tid_16, tid_31, tid_32, kk, trackF, K1, SK1 + 110-115 : F00_<0-1>, F01_<0-1>, F02_<0-1>, + 116-121 : F10_<0-1>, F11_<0-1>, F12_<0-1>, + 122-127 : F20_<0-1>, F21_<0-1>, F22_<0-1> +[+ + our $IX; + return $IX ? q{ + 96-99 : trackI<0-1>, offsetI<0-1> + 100-103 ~ swapBuffer, gy, gx + + 104-119 : I0<0-3>, I1<0-3>, I2<0-3>, I3<0-3> + } : q{ + // registers reordered to avoid bank conflicts + 104 = y0x0, Y0X0, I00, Y1X0 + 105 = y0x1, Y0X1, I02, Y1X2 + 106 = y0x2, Y0X2, I13 + 107 = y0x3, Y0X3, I03, Y1X3 + 108 = y1x0, I04 + 110 = y1x1, I05 + 109 = y1x2, I06 + 111 = y1x3, I07 + 113 = y2x0, Y2X0, I08 + 112 = y2x1, Y2X1 + 119 = y2x2, Y2X2, I10 + 117 = y2x3, Y2X3, I11 + 115 = y3x0, Y3X0, I12 + 116 = y3x1, Y3X1, I14 + 114 = y3x2, Y3X2, I09 + 118 = y3x3, Y3X3, I15 + 80 = I01 + 64 = Y1X1 + }; ++] + // Error registers + 104 = p0q0, E00 + 105 = p0q1, E03 + 106 = p1q0, E12 + 107 = p1q1, E15 + 108 = e0, C0, E08 + 109 = E01 + 110 = E02 + 111 = e1, C1, E11 + 112 = E13 + 113 = E14 + 114 = B0, E04 + 115 = B1, E07 + 116 = e2, E06 + 117 = e3, E10 + 118 = E05 + 119 = E09 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blk_KCPQkc, SR_CTAID.X; + + +01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 128, PT; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + + +--:-:-:-:1 MOV magic_CPQkc, param_magic_CPQkc; +--:-:-:-:1 MOV magic_PQkc, param_magic_PQkc; +--:-:-:-:1 MOV magic_Qkc, param_magic_Qkc; +--:-:-:-:1 IADD neg_CPQkc, RZ, -param_CPQkc; +--:-:-:-:1 IADD neg_PQkc, RZ, -param_PQkc; +--:-:-:-:1 IADD neg_Qkc, RZ, -param_Qkc; +--:-:-:-:1 IADD neg_kc, RZ, -param_kc; +--:-:-:-:1 IADD neg_c, RZ, -param_c; + +--:-:-:-:1 ISETP.NE.AND P1, PT, 
magic_CPQkc, 1, PT; +--:-:-:-:1 ISETP.NE.AND P2, PT, magic_PQkc, 1, PT; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_Qkc, 1, PT; + +// blk_K = blk_KCPQkc / CPQkc +02:-:-:-:1 @P1 XMAD div1, blk_KCPQkc, magic_CPQkc, RZ; +--:-:-:-:1 @P1 XMAD div2, blk_KCPQkc, magic_CPQkc.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, blk_KCPQkc.H1, magic_CPQkc.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, blk_KCPQkc.H1, magic_CPQkc, div1; +--:-:-:-:1 @P1 IADD3.RS blk_K, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 blk_K, blk_K, param_shift_CPQkc; +--:-:-:-:1 @!P1 SHR.U32 blk_K, blk_KCPQkc, param_shift_CPQkc; + +// blk_CPQkc = blk_KCPQkc % CPQkc +--:-:-:-:1 XMAD.LO2 blk_CPQkc, neg_CPQkc, blk_K, blk_KCPQkc; + +// blk_C = blk_CPQkc / PQkc +--:-:-:-:1 @P2 XMAD div1, blk_CPQkc, magic_PQkc, RZ; +--:-:-:-:1 @P2 XMAD div2, blk_CPQkc, magic_PQkc.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, blk_CPQkc.H1, magic_PQkc.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, blk_CPQkc.H1, magic_PQkc, div1; +--:-:-:-:1 @P2 IADD3.RS blk_C, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 blk_C, blk_C, param_shift_PQkc; +--:-:-:-:1 @!P2 SHR.U32 blk_C, blk_CPQkc, param_shift_PQkc; + +// blk_PQkc = blk_CPQkc % PQkc +--:-:-:-:1 XMAD.LO2 blk_PQkc, neg_PQkc, blk_C, blk_CPQkc; + +// blk_P = blk_PQkc / Qkc +--:-:-:-:1 @P3 XMAD div1, blk_PQkc, magic_Qkc, RZ; +--:-:-:-:1 @P3 XMAD div2, blk_PQkc, magic_Qkc.H1, RZ; +--:-:-:-:1 @P3 XMAD div3, blk_PQkc.H1, magic_Qkc.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, blk_PQkc.H1, magic_Qkc, div1; +--:-:-:-:1 @P3 IADD3.RS blk_P, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 blk_P, blk_P, param_shift_Qkc; +--:-:-:-:1 @!P3 SHR.U32 blk_P, blk_PQkc, param_shift_Qkc; + +// blk_Qkc = blk_PQkc % Qkc +--:-:-:-:1 XMAD.LO2 blk_Qkc, neg_Qkc, blk_P, blk_PQkc; + +// blk_Q = blk_Qkc / kc +--:-:-:-:1 XMAD.LO2C blk_Q, blk_Qkc, param_magic_kc, RZ; +--:-:-:-:1 SHR.U32 blk_Q, blk_Q, param_shift_kc; +// blk_kc = blk_Qkc % kc +--:-:-:-:1 XMAD.S16.U16 blk_kc, neg_kc, blk_Q, blk_Qkc; + +// blk_k = blk_kc / c +--:-:-:-:1 XMAD blk_k, blk_kc, param_magic_c, RZ; 
+--:-:-:-:1 SHR.U32 blk_k, blk_k, param_shift_c; +// blk_c = blk_kc % c +--:-:-:-:1 XMAD.S16.U16 blk_c, neg_c, blk_k, blk_kc; + +// blk_K = blk_K*param_k + blk_k +--:-:-:-:1 XMAD blk_K, blk_K, param_k, blk_k; +// blk_C = blk_C*param_c + blk_c +--:-:-:-:1 XMAD blk_C, blk_C, param_c, blk_c; + +// Spill these block constants to shared +--:-:-:-:1 ISETP.EQ.AND P5, PT, tid, RZ, PT; +--:-:-:-:1 @P5 STS [addr_blk_K], blk_K; +--:-:-:-:1 @P5 STS [addr_blk_C], blk_C; +--:-:-:-:1 @P5 STS [addr_blk_P], blk_P; +--:-:-:-:1 @P5 STS [addr_blk_Q], blk_Q; + +// gxs = blk_Q +// gys = blk_P +--:-:-:-:1 MOV gxs, blk_Q; +--:-:-:-:1 MOV gys, blk_P; + +[+ + our $IX; + return $IX ? '' : q{ +--:-:-:-:1 BFE.U32 n, tid, param_superN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +// tidX = (tid & 127) >> 2 +// tidY = tid & 3 +// writeS = tidY*512 + tidX + (tidY << 3) +--:-:-:-:1 BFE.U32 tidX, tid, 0x502; // 5 bits at position 2 +--:-:-:-:1 LOP.AND tidY, tid, 3; +--:-:-:-:1 ISCADD writeS, tidY, tidX, 9; +--:-:-:-:1 ISCADD writeS, tidY, writeS, 3; +--:-:-:-:1 SHL writeS, writeS, 2; + }; ++] + +// readEs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readIs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readIs, tid, 8; +--:-:-:-:1 SHR.U32 readIs, readIs, 2; +--:-:-:-:1 LOP3.LUT readIs, readIs, tid16, tid1, 0xfe; +--:-:-:-:1 SHL readIs, readIs, 4; + +--:-:-:-:1 BFE.U32 readEs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readEs, readEs, tid16; +--:-:-:-:1 ISCADD readEs, readEs, 4x<512*4 + 32>, 4; + + +--:-:-:-:5 @P0 BRA.U ERROR_SETUP; + +[+ + our ($IX, $dtype_shift); + return $IX ? 
qq{ + +--:-:-:-:1 MOV swapBuffer, 4x<(512*4 + 32)*2>; + +// tidY = (tid & 127) / 32 +--:-:-:-:1 BFE.U32 tidY, tid, 0x205; // 2 bits at position 5 +--:-:-:-:1 BFE.U32 n, tid, param_superNI; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +// writeS = (tidY*512 + (tid & 31)*4)*4 +--:-:-:-:1 LOP.AND tidX, tid, 31; +--:-:-:-:1 SHL writeS, tidX, 4; +--:-:-:-:1 ISCADD writeS, tidY, writeS, 11; +// offsetI = I + (tid & 31)*4 +--:-:-:-:1 LEA offsetI0.CC, tidX, param_I[0], 1x<$dtype_shift + 2>; +--:-:-:-:1 LEA.HI.X offsetI1, tidX, param_I[1], RZ, 1x<$dtype_shift + 2>; + + + } : ''; ++] + +--:-:-:-:0 MOV blkC, blk_C; + +// IMAGE_SETUP +--:-:-:-:5 CAL IMAGE_OFFSET; +--:-:-:-:5 CAL IMAGE_LOAD; +--:-:-:-:5 CAL IMAGE_OFFSET; + +[+ + our ($convert_in, $IX); + if ($convert_in) + { + my $out = $IX ? qq{ +02:-:-:-:1 $convert_in I03, I01.H1; +--:-:-:-:1 $convert_in I02, I01.H0; +--:-:-:-:1 $convert_in I01, I00.H1; +--:-:2:-:1 $convert_in I00, I00.H0; + +04:-:-:-:1 $convert_in I13, I11.H1; +--:-:-:-:1 $convert_in I12, I11.H0; +--:-:-:-:1 $convert_in I11, I10.H1; +--:-:3:-:1 $convert_in I10, I10.H0; + +08:-:-:-:1 $convert_in I23, I21.H1; +--:-:-:-:1 $convert_in I22, I21.H0; +--:-:-:-:1 $convert_in I21, I20.H1; +--:-:4:-:1 $convert_in I20, I20.H0; + +10:-:-:-:1 $convert_in I33, I31.H1; +--:-:-:-:1 $convert_in I32, I31.H0; +--:-:-:-:1 $convert_in I31, I30.H1; +--:-:5:-:1 $convert_in I30, I30.H0; + } : qq{ +02:-:-:-:1 $convert_in y0x0, y0x0; +--:-:-:-:1 $convert_in y0x1, y0x1; +--:-:-:-:1 $convert_in y0x2, y0x2; +--:-:2:-:1 $convert_in y0x3, y0x3; + +04:-:-:-:1 $convert_in y2x0, y2x0; +--:-:-:-:1 $convert_in y2x1, y2x1; +--:-:-:-:1 $convert_in y2x2, y2x2; +--:-:3:-:1 $convert_in y2x3, y2x3; + +08:-:-:-:1 $convert_in y1x0, y1x0; +--:-:-:-:1 $convert_in y1x1, y1x1; +--:-:-:-:1 $convert_in y1x2, y1x2; +--:-:4:-:1 $convert_in y1x3, y1x3; + +10:-:-:-:1 $convert_in y3x0, y3x0; +--:-:-:-:1 $convert_in y3x1, y3x1; +--:-:-:-:1 $convert_in y3x2, y3x2; +--:-:5:-:1 $convert_in y3x3, y3x3; + }; + 
return qq{ + + +$out + +--:-:-:-:1 NOP; # we need 20 total conversions. that's 4 short of instruction 2 cache lines +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; +--:-:-:-:1 NOP; + + }; + } + return ''; ++] + +[+ + our $IX; + return $IX ? q{ +02:-:-:-:1 STS.128 [writeS + 4x<00*4>], I0; +04:-:-:-:1 STS.128 [writeS + 4x<32*4>], I1; +08:-:-:-:1 STS.128 [writeS + 4x<64*4>], I2; +10:-:-:-:1 STS.128 [writeS + 4x<96*4>], I3; + +// init = bNextY ? 1 : 0 +--:-:-:-:0 SEL pred_bits, RZ, 1, !P6; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuffer; +--:-:-:-:0 IADD swapBuffer, RZ, -swapBuffer; + +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>]; + +--:-:-:-:5 CAL IMAGE_LOAD; + +// init += bNextY ? 1 : 0 +--:-:-:-:0 @P6 IADD pred_bits, pred_bits, 1; + +--:-:-:-:5 CAL IMAGE_OFFSET; +--:-:-:-:5 BRA.U IMAGE_LOOP; + } : q{ + + +06:-:-:-:1 FADD Y0X0, y0x0, -y2x0; +--:-:-:-:1 FADD Y0X1, y0x1, -y2x1; +--:-:-:-:1 FADD Y0X2, y0x2, -y2x2; +--:-:-:-:1 FADD Y0X3, y0x3, -y2x3; +--:-:-:-:1 FADD I00, Y0X0, -Y0X2; +--:-:-:-:1 FADD I03, -Y0X1, Y0X3; +--:-:-:-:1 FADD I01, Y0X1, Y0X2; +--:-:-:-:1 FADD I02, Y0X2, -Y0X1; +--:-:-:-:1 STS [writeS + 4x<32*00>], I00; +--:-:-:-:1 STS [writeS + 4x<32*03>], I03; +--:-:-:-:1 STS [writeS + 4x<32*01>], I01; +--:6:-:-:1 STS [writeS + 4x<32*02>], I02; +18:-:-:-:1 FADD Y3X0, -y1x0, y3x0; +--:-:-:-:1 FADD Y3X1, -y1x1, y3x1; +--:-:-:-:1 FADD Y3X2, -y1x2, y3x2; +--:-:-:-:1 FADD Y3X3, -y1x3, y3x3; +--:-:-:-:1 FADD I12, Y3X0, -Y3X2; +--:-:-:-:1 FADD I15, -Y3X1, Y3X3; +--:-:-:-:1 FADD I13, Y3X1, Y3X2; +--:-:-:-:1 FADD I14, Y3X2, -Y3X1; +--:-:-:-:1 STS [writeS + 4x<32*12>], I12; +--:-:-:-:1 STS [writeS + 4x<32*15>], I15; +--:-:-:-:1 STS [writeS + 4x<32*13>], I13; +--:-:-:-:1 STS [writeS + 4x<32*14>], I14; +20:-:-:-:1 FADD Y1X0, y1x0, y2x0; +--:-:-:-:1 FADD Y1X1, y1x1, y2x1; +--:-:-:-:1 FADD 
Y1X2, y1x2, y2x2; +--:-:-:-:1 FADD Y1X3, y1x3, y2x3; +--:-:-:-:1 FADD Y2X0, y2x0, -y1x0; +--:-:-:-:1 FADD Y2X1, y2x1, -y1x1; +--:-:-:-:1 FADD Y2X2, y2x2, -y1x2; +--:-:-:-:1 FADD Y2X3, y2x3, -y1x3; +--:-:-:-:1 FADD I04, Y1X0, -Y1X2; +--:-:-:-:1 FADD I05, Y1X1, Y1X2; +--:-:-:-:1 FADD I06, Y1X2, -Y1X1; +--:-:-:-:1 FADD I07, -Y1X1, Y1X3; +--:-:-:-:1 STS [writeS + 4x<32*04>], I04; +--:-:-:-:1 STS [writeS + 4x<32*05>], I05; +--:-:-:-:1 STS [writeS + 4x<32*06>], I06; +--:-:-:-:1 STS [writeS + 4x<32*07>], I07; +--:-:-:-:1 FADD I08, Y2X0, -Y2X2; +--:-:-:-:1 FADD I11, -Y2X1, Y2X3; +--:-:-:-:1 FADD I09, Y2X1, Y2X2; +--:-:-:-:1 FADD I10, Y2X2, -Y2X1; +--:-:-:-:1 STS [writeS + 4x<32*08>], I08; +--:-:-:-:1 STS [writeS + 4x<32*11>], I11; +--:-:-:-:1 STS [writeS + 4x<32*09>], I09; +--:-:-:-:1 STS [writeS + 4x<32*10>], I10; + + + +// init = bNextY ? 1 : 0 +--:-:-:-:0 SEL init, RZ, 1, !P6; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 IADD writeS, writeS, 4x<(512*4 + 32)*2>; + +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>]; + +--:-:-:-:5 CAL IMAGE_LOAD; + +// init += bNextY ? 1 : 0 +--:-:-:-:0 @P6 IADD init, init, 1; +--:-:-:-:5 CAL IMAGE_OFFSET; +--:-:-:-:0 BFI pred_bits, init, 0x214, pred_bits; // 2 bits at position 20 +--:-:-:-:5 BRA.U IMAGE_LOOP; + }; ++] + + +IMAGE_OFFSET: + + +[+ + our ($dtype_shift, $IX); + return $IX ? 
qq{ + +--:-:-:-:1 BFE.U32 super_x, tid, param_superXI; +--:-:-:-:1 BFE.U32 super_y, tid, param_superYI; +--:-:-:-:1 SHL gx, gxs, param_shiftXI; +--:-:-:-:1 SHL gy, gys, param_shiftYI; +--:-:-:-:1 IADD gx, gx, super_x; +--:-:-:-:1 IADD gy, gy, super_y; + +--:-:-:-:1 ISETP.LT.AND P0, PT, gx, param_GX, P4; +--:-:-:-:1 ISETP.LT.AND P0, PT, gy, param_GY, P0; + +// offset = blkC*GY*GX*N + gy*GX*N + gx*N + n +--:-:-:-:1 XMAD.U16.U16 offset, gx, param_N, n; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, gy, param_XN, offset; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, blkC, param_YXN, offset; + +// trackI = offsetI + offset*512 +20:-:-:-:1 LEA trackI0.CC, offset, offsetI0, 1x<$dtype_shift + 9>; +--:-:-:-:0 LEA.HI.X trackI1, offset, offsetI1, RZ, 1x<$dtype_shift + 9>; + } : qq{ +// Calc superblock coordinates +01:-:-:-:1 SHL x, gxs, param_shiftX; +--:-:-:-:1 SHL y, gys, param_shiftY; + +// Calc this thread's sub-block coordinates +--:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 super_y, tid, param_superY; +--:-:-:-:1 ISCADD x, super_x, x, 1; +--:-:-:-:1 ISCADD y, super_y, y, 1; + +// Apply padding +--:-:-:-:1 IADD x, x, -param_pad_x; +--:-:-:-:1 IADD y, y, -param_pad_y; + +// c = blkC*32 + tidX +--:-:-:-:1 BFE.U32 tid_X, tid, 0x502; // 5 bits at position 2 +--:-:-:-:1 ISCADD c, blkC, tid_X, 5; +--:-:-:-:1 ISETP.LT.AND P4, PT, c, param_C, P4; + +// offset = c*YXN + y*XN + x*N + n +--:-:-:-:1 XMAD.S16.U16 offset, x, param_N, n; +--:-:-:-:1 XMAD.S16.U16.LO2C offset, y, param_XN, offset; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, c, param_YXN, offset; +--:-:-:-:1 ISET.LT.AND offsign, offset, RZ, PT; + +20:-:-:-:1 LEA track00.CC, offset, param_I[0], $dtype_shift; +--:-:-:-:1 IADD.X track01, offsign, param_I[1]; +--:-:-:-:1 IADD track10.CC, track00, param_Np; +--:-:-:-:1 IADD.X track11, track01, RZ; +--:-:-:-:1 IADD track20.CC, track10, param_Np; +--:-:-:-:1 IADD.X track21, track11, RZ; +--:-:-:-:1 IADD track30.CC, track20, param_Np; +--:-:-:-:1 IADD.X track31, track21, RZ; + 
+--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 IADD x2, x, 2; +--:-:-:-:1 IADD x3, x, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_X, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_X, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_X, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_X, P4; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x0f; + +--:-:-:-:1 IADD x1, y, 1; +--:-:-:-:1 IADD x2, y, 2; +--:-:-:-:1 IADD x3, y, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, y, param_Y, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_Y, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_Y, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_Y, P4; +--:-:-:-:1 ISETP.GE.AND P0, PT, y, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; + +--:-:-:-:1 SEL pred_bits, mask_x, RZ, P0; +--:-:-:-:1 \@P1 BFI pred_bits, mask_x, 0x404, pred_bits; +--:-:-:-:1 \@P2 BFI pred_bits, mask_x, 0x408, pred_bits; +--:-:-:-:1 \@P3 BFI pred_bits, mask_x, 0x40c, pred_bits; + +// Cache y preds in high bits +--:-:-:-:1 P2R mask_y, PR, RZ, 0x0f; +--:-:-:-:0 BFI pred_bits, mask_y, 0x410, pred_bits; // 4 bits at position 16 + }; ++] + + +--:-:-:-:5 RET; + +IMAGE_LOAD: + + + +[+ + our ($dtype, $dtype_shift, $IX, $vec_size, $dtype_size); + return $IX ? 
qq{ + +--:-:2:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero]; +--:-:3:-:1 \@!P0 LDS.U.$vec_size I1, [addr_zero]; +--:-:4:-:1 \@!P0 LDS.U.$vec_size I2, [addr_zero]; +--:-:5:-:1 \@!P0 LDS.U.$vec_size I3, [addr_zero]; + +--:-:2:-:1 \@P0 LDG.E.CG.$vec_size I0, [trackI + 4x<00 * $dtype_size>]; +--:-:3:-:1 \@P0 LDG.E.CG.$vec_size I1, [trackI + 4x<32 * $dtype_size>]; +--:-:4:-:1 \@P0 LDG.E.CG.$vec_size I2, [trackI + 4x<64 * $dtype_size>]; +--:6:5:-:1 \@P0 LDG.E.CG.$vec_size I3, [trackI + 4x<96 * $dtype_size>]; + + } : qq{ +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 SHF.R.U64 pred_bits, pred_bits, 8, pred_bits; + +--:-:-:-:1 \@!P0 MOV y0x0, RZ; +--:-:-:-:1 \@P0 LDG.E.CI$dtype y0x0, [track0]; +--:-:-:-:1 \@!P1 MOV y0x1, RZ; +--:-:-:-:1 \@P1 LDG.E.CI$dtype y0x1, [track1]; +--:-:-:-:1 \@!P2 MOV y0x2, RZ; +--:-:-:-:1 \@P2 LDG.E.CI$dtype y0x2, [track2]; +--:-:-:-:1 \@!P3 MOV y0x3, RZ; +--:6:2:-:1 \@P3 LDG.E.CI$dtype y0x3, [track3]; +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 SHF.L.U64 pred_bits, pred_bits, 4, pred_bits; +20:-:-:-:1 IADD track00.CC, track00, param_2XNp; +--:-:-:-:1 IADD.X track01, track01, RZ; +--:-:-:-:1 IADD track10.CC, track10, param_2XNp; +--:-:-:-:1 IADD.X track11, track11, RZ; +--:-:-:-:1 IADD track20.CC, track20, param_2XNp; +--:-:-:-:1 IADD.X track21, track21, RZ; +--:-:-:-:1 IADD track30.CC, track30, param_2XNp; +--:-:-:-:1 IADD.X track31, track31, RZ; + +--:-:-:-:1 \@!P0 MOV y2x0, RZ; +--:-:-:-:1 \@P0 LDG.E.CI$dtype y2x0, [track0]; +--:-:-:-:1 \@!P1 MOV y2x1, RZ; +--:-:-:-:1 \@P1 LDG.E.CI$dtype y2x1, [track1]; +--:-:-:-:1 \@!P2 MOV y2x2, RZ; +--:-:-:-:1 \@P2 LDG.E.CI$dtype y2x2, [track2]; +--:-:-:-:1 \@!P3 MOV y2x3, RZ; +--:6:3:-:1 \@P3 LDG.E.CI$dtype y2x3, [track3]; +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 SHF.R.U64 pred_bits, pred_bits, 8, pred_bits; +20:-:-:-:1 IADD track00.CC, track00, -param_XNp; +--:-:-:-:1 IADD.X track01, track01, -RZ; +--:-:-:-:1 IADD track10.CC, track10, -param_XNp; +--:-:-:-:1 IADD.X track11, track11, -RZ; 
+--:-:-:-:1 IADD track20.CC, track20, -param_XNp; +--:-:-:-:1 IADD.X track21, track21, -RZ; +--:-:-:-:1 IADD track30.CC, track30, -param_XNp; +--:-:-:-:1 IADD.X track31, track31, -RZ; + +--:-:-:-:1 \@!P0 MOV y1x0, RZ; +--:-:-:-:1 \@P0 LDG.E.CI$dtype y1x0, [track0]; +--:-:-:-:1 \@!P1 MOV y1x1, RZ; +--:-:-:-:1 \@P1 LDG.E.CI$dtype y1x1, [track1]; +--:-:-:-:1 \@!P2 MOV y1x2, RZ; +--:-:-:-:1 \@P2 LDG.E.CI$dtype y1x2, [track2]; +--:-:-:-:1 \@!P3 MOV y1x3, RZ; +--:6:4:-:1 \@P3 LDG.E.CI$dtype y1x3, [track3]; +--:-:-:-:1 R2P PR, pred_bits, 0x0f; +--:-:-:-:1 SHF.L.U64 pred_bits, pred_bits, 12, pred_bits; +20:-:-:-:1 IADD track00.CC, track00, param_2XNp; +--:-:-:-:1 IADD.X track01, track01, RZ; +--:-:-:-:1 IADD track10.CC, track10, param_2XNp; +--:-:-:-:1 IADD.X track11, track11, RZ; +--:-:-:-:1 IADD track20.CC, track20, param_2XNp; +--:-:-:-:1 IADD.X track21, track21, RZ; +--:-:-:-:1 IADD track30.CC, track30, param_2XNp; +--:-:-:-:1 IADD.X track31, track31, RZ; + +--:-:-:-:1 \@!P0 MOV y3x0, RZ; +--:-:-:-:1 \@P0 LDG.E.CI$dtype y3x0, [track0]; +--:-:-:-:1 \@!P1 MOV y3x1, RZ; +--:-:-:-:1 \@P1 LDG.E.CI$dtype y3x1, [track1]; +--:-:-:-:1 \@!P2 MOV y3x2, RZ; +--:-:-:-:1 \@P2 LDG.E.CI$dtype y3x2, [track2]; +--:-:-:-:1 \@!P3 MOV y3x3, RZ; +--:6:5:-:1 \@P3 LDG.E.CI$dtype y3x3, [track3]; + }; ++] + + +// Advance offset/preds +--:-:-:-:1 IADD n, n, param_loopN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +--:-:-:-:1 @!P4 BFE.U32 n, tid, param_superNI; +--:-:-:-:1 @!P4 IADD gxs, gxs, param_strideX; + +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, PT; + +--:-:-:-:1 @!P5 MOV gxs, blk_Q; +--:-:-:-:1 @!P5 IADD gys, gys, param_strideY; + +--:-:-:-:1 ISETP.LT.AND P6, PT, gys, param_GYS, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, P6; +--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6; + + +--:-:-:-:5 RET; + +ERROR_SETUP: + +[+ + our $IX; + return $IX ? 
q{ + +--:-:-:-:1 BFE.U32 n, tid, param_superN; +--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT; + +// tidX = (tid & 127) >> 2 +// tidY = tid & 3 +// writeS = tidY*512 + tidX + (tidY << 3) +--:-:-:-:1 BFE.U32 tidX, tid, 0x502; // 5 bits at position 2 +--:-:-:-:1 LOP.AND tidY, tid, 3; +--:-:-:-:1 ISCADD writeS, tidY, tidX, 9; +--:-:-:-:1 ISCADD writeS, tidY, writeS, 3; +--:-:-:-:1 SHL writeS, writeS, 2; + + } : ''; ++] + +--:-:-:-:0 MOV blkK, blk_K; + +--:-:-:-:5 CAL ERROR_OFFSET; +--:-:-:-:5 CAL ERROR_LOAD; +--:-:-:-:5 CAL ERROR_OFFSET; + + +[+ + our ($convert_in); + return $convert_in ? qq{ + +02:-:2:-:1 $convert_in p0q0, p0q0; +04:-:3:-:1 $convert_in p0q1, p0q1; +08:-:4:-:1 $convert_in p1q1, p1q1; +10:-:5:-:1 $convert_in p1q0, p1q0; + + } : ''; ++] + + +02:-:-:-:1 FMUL e0, p0q0, 0.5; +04:-:-:-:1 FFMA E01, p0q1, 0.5, e0; +--:-:-:-:1 FFMA E02, p0q1, -0.5, e0; +08:-:-:-:1 FMUL e1, p1q1, 0.5; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*00 + 32>], E00; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*01 + 32>], E01; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*02 + 32>], E02; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*03 + 32>], E03; +10:-:-:-:1 FFMA E13, p1q0, 0.5, e1; +--:-:-:-:1 FFMA E14, p1q0, 0.5, -e1; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*12 + 32>], E12; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*15 + 32>], E15; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*13 + 32>], E13; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*14 + 32>], E14; +--:-:-:-:1 FFMA B0, p1q0, 0.5, e0; +--:-:-:-:1 FFMA C0, p1q0, -0.5, e0; +--:-:-:-:1 FFMA B1, p0q1, 0.5, e1; +--:-:-:-:1 FFMA C1, p0q1, 0.5, -e1; +--:-:-:-:1 FMUL e2, B0, 0.5; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*04 + 32>], E04; +--:-:-:-:1 FMUL e3, C0, 0.5; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*08 + 32>], E08; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*07 + 32>], E07; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*11 + 32>], E11; +--:-:-:-:1 FFMA E05, B1, 0.5, e2; +--:-:-:-:1 FFMA E06, B1, -0.5, e2; +--:-:-:-:1 FFMA E09, C1, 0.5, e3; +--:-:-:-:1 FFMA E10, C1, -0.5, e3; 
+--:-:-:-:1 STS [writeS + 4x<512*4 + 32*05 + 32>], E05; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*06 + 32>], E06; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*09 + 32>], E09; +--:-:-:-:1 STS [writeS + 4x<512*4 + 32*10 + 32>], E10; + + + + +// init = bNextY ? 1 : 0 +--:-:-:-:0 SEL init, RZ, 1, !P6; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 IADD writeS, writeS, 4x<(512*4 + 32)*2>; + +--:-:-:-:1 LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>]; +--:-:-:-:1 LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>]; +--:-:1:-:1 LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>]; + +--:-:-:-:5 CAL ERROR_LOAD; + +// init += bNextY ? 1 : 0 +--:-:-:-:0 @P6 IADD init, init, 1; +--:-:-:-:5 CAL ERROR_OFFSET; +--:-:-:-:0 BFI pred_bits, init, 0x208, pred_bits; // 2 bits at position 8 +--:-:-:-:5 BRA.U ERROR_LOOP; + +ERROR_OFFSET: + + +// Calc superblock coordinates +01:-:-:-:1 SHL x, gxs, param_shiftX; +--:-:-:-:1 SHL y, gys, param_shiftY; + +// Calc this thread's sub-block coordinates +--:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 super_y, tid, param_superY; +--:-:-:-:1 ISCADD x, super_x, x, 1; +--:-:-:-:1 ISCADD y, super_y, y, 1; + +// k = blkK*32 + tidX (have k share register with c) +--:-:-:-:1 BFE.U32 tid_X, tid, 0x502; // 5 bits at position 2 +--:-:-:-:1 ISCADD c, blkK, tid_X, 5; +--:-:-:-:1 ISETP.LT.AND P4, PT, c, param_K, P4; + +// offset0 = k*PQN + y*QN + x*N + n +// offset1 = offset0 + N +// offset2 = offset0 + QN +// offset3 = offset1 + QN +--:-:-:-:1 XMAD.S16.U16 offset, x, param_N, n; +--:-:-:-:1 XMAD.S16.U16.LO2C offset, y, param_QN, offset; +--:-:-:-:1 XMAD.U16.U16.LO2C offset, c, param_PQN, offset; + +20:-:-:-:1 LEA track00.CC, offset, param_E[0], [+ dtype_shift() +]; +--:-:-:-:1 IADD.X track01, RZ, param_E[1]; +--:-:-:-:1 IADD track10.CC, track00, param_Np; +--:-:-:-:1 IADD.X track11, track01, RZ; +--:-:-:-:1 IADD track20.CC, track00, param_QNp; +--:-:-:-:1 IADD.X track21, track01, RZ; +--:-:-:-:1 IADD track30.CC, track10, 
param_QNp;
+--:-:-:-:0 IADD.X track31, track11, RZ;
+
+--:-:-:-:1 IADD x1, x, 1;
+--:-:-:-:1 IADD x2, y, 1;
+
+--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_Q, P4;
+--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_Q, P4;
+--:-:-:-:1 ISETP.LT.AND P2, PT, y, param_P, P4;
+--:-:-:-:1 ISETP.LT.AND P3, PT, x2, param_P, P4;
+
+--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0;
+--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1 ISETP.GE.AND P2, PT, y, RZ, P2;
+--:-:-:-:1 ISETP.GE.AND P3, PT, x2, RZ, P3;
+
+--:-:-:-:1 P2R mask_x, PR, RZ, 0x03;
+--:-:-:-:1 P2R mask_y, PR, RZ, 0x0c;
+
+--:-:-:-:1 SEL pred_bits, mask_x, RZ, P2;
+--:-:-:-:1 @P3 BFI pred_bits, mask_x, 0x202, pred_bits; // 2 bits at position 2
+
+// Cache y preds in high bits
+--:-:-:-:0 BFI pred_bits, mask_y, 0x404, pred_bits; // 4 bits at position 4
+
+
+--:-:-:-:5 RET;
+
+ERROR_LOAD:
+
+
+
+--:-:-:-:1 R2P PR, pred_bits, 0x0f;
+--:-:-:-:1 @!P0 MOV p0q0, RZ;
+--:-:2:-:1 @P0 LDG.E.CI[+ dtype() +] p0q0, [track0];
+--:-:-:-:1 @!P1 MOV p0q1, RZ;
+--:-:3:-:1 @P1 LDG.E.CI[+ dtype() +] p0q1, [track1];
+--:-:-:-:1 @!P3 MOV p1q1, RZ;
+--:-:4:-:1 @P3 LDG.E.CI[+ dtype() +] p1q1, [track3];
+--:-:-:-:1 @!P2 MOV p1q0, RZ;
+--:6:5:-:1 @P2 LDG.E.CI[+ dtype() +] p1q0, [track2];
+
+
+
+// Advance offset/preds
+--:-:-:-:1 IADD n, n, param_loopN;
+--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, PT;
+
+--:-:-:-:1 @!P4 BFE.U32 n, tid, param_superN;
+--:-:-:-:1 @!P4 IADD gxs, gxs, param_strideX;
+
+--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, PT;
+
+--:-:-:-:1 @!P5 MOV gxs, blk_Q;
+--:-:-:-:1 @!P5 IADD gys, gys, param_strideY;
+
+--:-:-:-:1 ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6;
+
+
+--:-:-:-:5 RET;
+
+IMAGE_LOOP:
+
+[+
+    our ($dtype, $dtype_shift, $dtype_size, $vec_size, $convert_in, $IX); # package variables; none of them is assigned inside this generator block
+    my %insert = ( # key "j<J>c<C>" => text emitted right after FFMA number C (0..63) of pass J (0..3) by the loop at the end of this block
+
+    $IX ? ( # entries used when $IX is true; the alternative list follows the matching ") : ("
+
+        j0c8 => "--:-:-:-:1 ISETP.LT.AND P0, PT, gx, param_GX, P6;\n",
+        j0c20 => "--:-:-:-:1 ISETP.LT.AND P0, PT, gy, param_GY, P0;\n",
+
+        j1c10 => "20:-:-:-:1 \@P0 LEA trackI0.CC, offset, offsetI0, 1x<$dtype_shift + 9>;\n",
+        j1c15 => "--:-:-:-:1 \@P0 LEA.HI.X trackI1, offset, offsetI1, RZ, 1x<$dtype_shift + 9>;\n",
+
+        j1c32 => "02:2:-:-:1 STS.128 [writeS + 4x<00*4>], I0;\n",
+        j1c36 => "02:-:2:-:1 \@P0 LDG.E.CG.$vec_size I0, [trackI + 4x<00 * $dtype_size>];\n",
+        j1c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];\n",
+
+        j1c56 => "04:3:-:-:1 STS.128 [writeS + 4x<32*4>], I1;\n",
+        j1c60 => "04:-:3:-:1 \@P0 LDG.E.CG.$vec_size I1, [trackI + 4x<32 * $dtype_size>];\n",
+        j1c62 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I1, [addr_zero];\n",
+
+
+        j2c32 => "08:4:-:-:1 STS.128 [writeS + 4x<64*4>], I2;\n",
+        j2c36 => "08:-:4:-:1 \@P0 LDG.E.CG.$vec_size I2, [trackI + 4x<64 * $dtype_size>];\n",
+        j2c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I2, [addr_zero];\n",
+
+
+        j2c56 => "10:5:-:-:1 STS.128 [writeS + 4x<96*4>], I3;\n",
+        j2c60 => "10:6:5:-:1 \@P0 LDG.E.CG.$vec_size I3, [trackI + 4x<96 * $dtype_size>];\n",
+        j2c62 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I3, [addr_zero];\n",
+
+        $convert_in ?
( + j1c16 => "02:-:-:-:1 $convert_in I03, I01.H1;\n", + j1c20 => "--:-:-:-:1 $convert_in I02, I01.H0;\n", + j1c24 => "--:-:-:-:1 $convert_in I01, I00.H1;\n", + j1c28 => "--:-:2:-:1 $convert_in I00, I00.H0;\n", + + j1c40 => "04:-:-:-:1 $convert_in I13, I11.H1;\n", + j1c44 => "--:-:-:-:1 $convert_in I12, I11.H0;\n", + j1c48 => "--:-:-:-:1 $convert_in I11, I10.H1;\n", + j1c52 => "--:-:3:-:1 $convert_in I10, I10.H0;\n", + + j2c16 => "08:-:-:-:1 $convert_in I23, I21.H1;\n", + j2c20 => "--:-:-:-:1 $convert_in I22, I21.H0;\n", + j2c24 => "--:-:-:-:1 $convert_in I21, I20.H1;\n", + j2c28 => "--:-:4:-:1 $convert_in I20, I20.H0;\n", + + j2c40 => "10:-:-:-:1 $convert_in I33, I31.H1;\n", + j2c44 => "--:-:-:-:1 $convert_in I32, I31.H0;\n", + j2c48 => "--:-:-:-:1 $convert_in I31, I30.H1;\n", + j2c52 => "--:-:5:-:1 $convert_in I30, I30.H0;\n", + ) : (), + + j2c63 => "--:-:-:-:1 IADD n, n, param_loopN;\n" . + "--:-:-:-:0 IADD offset, offset, param_loopN;\n". + "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuffer;\n" . + "--:-:-:-:1 IADD readEs, readEs, -swapBuffer;\n" . + "--:-:-:-:1 IADD writeS, writeS, swapBuffer;\n" . + "--:-:-:-:1 IADD swapBuffer, RZ, -swapBuffer;\n", + + j3c8 => "--:-:-:-:1 PSETP.OR.AND P4, PT, P5, P6, PT;\n", + j3c21 => "--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, P4;\n", + + j3c34 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n", + + j3c63 => "--:-:-:Y:5 \@P4 BRA.U IMAGE_LOOP;\n", + + ) : ( + + $convert_in ? 
( + j0c37 => "02:-:-:-:1 $convert_in y0x0, y0x0;\n", + j0c41 => "--:-:-:-:1 $convert_in y0x1, y0x1;\n", + j0c45 => "--:-:-:-:1 $convert_in y0x2, y0x2;\n", + j0c49 => "--:-:2:-:1 $convert_in y0x3, y0x3;\n", + + j0c53 => "04:-:-:-:1 $convert_in y2x0, y2x0;\n", + j0c57 => "--:-:-:-:1 $convert_in y2x1, y2x1;\n", + j0c61 => "--:-:-:-:1 $convert_in y2x2, y2x2;\n", + j1c1 => "--:-:3:-:1 $convert_in y2x3, y2x3;\n", + + j1c5 => "08:-:-:-:1 $convert_in y1x0, y1x0;\n", + j1c10 => "--:-:-:-:1 $convert_in y1x1, y1x1;\n", + j1c14 => "--:-:-:-:1 $convert_in y1x2, y1x2;\n", + j1c16 => "--:-:4:-:1 $convert_in y1x3, y1x3;\n", + + j1c21 => "10:-:-:-:1 $convert_in y3x0, y3x0;\n", + j1c23 => "--:-:-:-:1 $convert_in y3x1, y3x1;\n", + j1c27 => "--:-:-:-:1 $convert_in y3x2, y3x2;\n", + j1c29 => "--:-:5:-:1 $convert_in y3x3, y3x3;\n", + ) : (), + + j1c22 => "06:-:-:-:1 FADD Y0X0, y0x0, -y2x0;\n" . + "--:-:-:-:1 FADD Y0X1, y0x1, -y2x1;\n", + + j1c24 => "--:-:-:-:1 FADD Y0X2, y0x2, -y2x2;\n" . + "--:-:-:-:1 FADD Y0X3, y0x3, -y2x3;\n", + + j1c28 => "--:-:-:-:1 FADD I00, Y0X0, -Y0X2;\n" . + "--:-:-:-:1 FADD I03, -Y0X1, Y0X3;\n", + j1c30 => "--:-:-:-:1 FADD I01, Y0X1, Y0X2;\n" . + "--:-:-:-:1 FADD I02, Y0X2, -Y0X1;\n", + + j1c31 => "--:-:-:-:1 STS [writeS + 4x<32*00>], I00;\n", + j1c33 => "--:-:-:-:1 STS [writeS + 4x<32*03>], I03;\n", + j1c35 => "--:-:-:-:1 STS [writeS + 4x<32*01>], I01;\n", + j1c37 => "--:2:-:-:1 STS [writeS + 4x<32*02>], I02;\n", + + j1c39 => "18:-:-:-:1 FADD Y3X0, -y1x0, y3x0;\n" . + "--:-:-:-:1 FADD Y3X1, -y1x1, y3x1;\n" . + "--:-:-:-:1 FADD Y3X2, -y1x2, y3x2;\n" . + "--:-:-:-:1 FADD Y3X3, -y1x3, y3x3;\n", + + j1c43 => "--:-:-:-:1 FADD I12, Y3X0, -Y3X2;\n" . + "--:-:-:-:1 FADD I15, -Y3X1, Y3X3;\n" . + "--:-:-:-:1 FADD I13, Y3X1, Y3X2;\n" . 
+ "--:-:-:-:1 FADD I14, Y3X2, -Y3X1;\n", + + j1c44 => "--:-:-:-:1 STS [writeS + 4x<32*12>], I12;\n", + j1c46 => "--:-:-:-:1 STS [writeS + 4x<32*15>], I15;\n", + j1c48 => "--:-:-:-:1 STS [writeS + 4x<32*13>], I13;\n", + j1c50 => "--:-:-:-:1 STS [writeS + 4x<32*14>], I14;\n", + + j1c52 => "--:-:-:-:1 R2P PR, pred_bits, 0x0f;\n" . + "--:-:-:-:1 SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;\n", + + j1c53 => "--:-:-:-:1 \@P6 ISET.LT.AND off_sign, offset, RZ, PT;\n" . + "--:-:-:-:1 \@P6 LEA track00.CC, offset, param_I[0], $dtype_shift;\n", + + j1c58 => "--:-:-:-:1 \@P6 IADD.X track01, off_sign, param_I[1];\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track00, param_Np;\n", + + j2c18 => "--:-:-:-:1 FADD Y1X0, y1x0, y2x0;\n" . + "--:-:-:-:1 FADD Y1X1, y1x1, y2x1;\n" . + "--:-:-:-:1 FADD Y1X2, y1x2, y2x2;\n" . + "--:-:-:-:1 FADD Y1X3, y1x3, y2x3;\n" . + "--:-:-:-:1 FADD Y2X0, y2x0, -y1x0;\n" . + "--:-:-:-:1 FADD Y2X1, y2x1, -y1x1;\n" . + "--:-:-:-:1 FADD Y2X2, y2x2, -y1x2;\n" . + "--:-:-:-:1 FADD Y2X3, y2x3, -y1x3;\n" . + "--:-:-:-:1 FADD I04, Y1X0, -Y1X2;\n" . + "--:-:-:-:1 FADD I05, Y1X1, Y1X2;\n" . + "--:-:-:-:1 FADD I06, Y1X2, -Y1X1;\n" . + "--:-:-:-:1 FADD I07, -Y1X1, Y1X3;\n", + + j2c19 => "--:-:-:-:1 STS [writeS + 4x<32*04>], I04;\n", + j2c21 => "--:-:-:-:1 STS [writeS + 4x<32*05>], I05;\n", + j2c23 => "--:-:-:-:1 STS [writeS + 4x<32*06>], I06;\n", + j2c25 => "--:-:-:-:1 STS [writeS + 4x<32*07>], I07;\n", + + j2c27 => "--:-:-:-:1 \@P6 IADD.X track11, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track10, param_Np;\n", + + j2c31 => "--:-:-:-:1 FADD I08, Y2X0, -Y2X2;\n" . + "--:-:-:-:1 FADD I11, -Y2X1, Y2X3;\n" . + "--:-:-:-:1 FADD I09, Y2X1, Y2X2;\n" . 
+ "--:-:-:-:1 FADD I10, Y2X2, -Y2X1;\n", + + j2c32 => "--:-:-:-:1 STS [writeS + 4x<32*08>], I08;\n", + j2c34 => "--:-:-:-:1 STS [writeS + 4x<32*11>], I11;\n", + j2c36 => "--:-:-:-:1 STS [writeS + 4x<32*09>], I09;\n", + j2c38 => "--:-:-:-:1 STS [writeS + 4x<32*10>], I10;\n", + + j2c40 => "--:-:-:-:1 \@P6 IADD.X track21, track11, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track20, param_Np;\n", + + j2c44 => "--:-:-:-:1 LOP.AND.NZ P4, RZ, pred_bits, 0x4000;\n" . + "--:-:-:-:1 LOP.XOR pred_bits, pred_bits, 0x4000;\n", + + j2c46 => "--:-:-:-:1 \@P6 IADD.X track31, track21, RZ;\n" . + "--:-:-:-:1 IADD n, n, param_loopN;\n" . + "--:-:-:-:1 IADD offset, offset, param_loopN;\n", + + j2c62 => "--:-:-:-:1 \@P4 MOV swapBuf, 4x<(512*4 + 32)*2>;\n" . + "--:-:-:-:1 \@!P4 MOV swapBuf, -4x<(512*4 + 32)*2>;\n", + + j2c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@!P0 I2I.U32.U32 y0x0, RZ;\n" . + "--:-:-:-:0 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 LDG.E.CI$dtype y0x0, [track0];\n" . + "--:-:-:-:0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@!P1 I2I.U32.U32 y0x1, RZ;\n" . + "--:-:-:-:1 \@P1 LDG.E.CI$dtype y0x1, [track1];\n", + + j3c0 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y0x2, RZ;\n", + j3c1 => "--:-:-:-:1 \@P2 LDG.E.CI$dtype y0x2, [track2];\n", + j3c2 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y0x3, RZ;\n", + j3c3 => "--:6:2:-:1 \@P3 LDG.E.CI$dtype y0x3, [track3];\n" . + "--:-:-:Y:8 R2P PR, pred_bits, 0x0f;\n" . + "20:-:-:-:1 \@P6 IADD track00.CC, track00, param_2XNp;\n" . + "--:-:-:-:1 SHF.L.U64 pred_bits, pred_bits, 4, pred_bits;\n", + + j3c7 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y2x0, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track01, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track10, param_2XNp;\n", + + j3c9 => "--:-:-:-:1 PSETP.OR.AND P4, PT, P5, P6, PT;\n", + + j3c11 => "--:-:-:-:1 \@P0 LDG.E.CI$dtype y2x0, [track0];\n" . + "--:-:-:-:0 \@P6 IADD.X track11, track11, RZ;\n" . 
+ "--:-:-:-:1 \@!P1 I2I.U32.U32 y2x1, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track20, param_2XNp;\n", + + j3c12 => "--:-:-:-:1 \@P1 LDG.E.CI$dtype y2x1, [track1];\n", + + j3c16 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y2x2, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track21, track21, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track30, param_2XNp;\n", + + j3c17 => "--:-:-:-:1 \@P2 LDG.E.CI$dtype y2x2, [track2];\n", + + + j3c21 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y2x3, RZ;\n" . + "--:-:-:-:2 \@P6 IADD.X track31, track31, RZ;\n", + + j3c22 => "--:6:3:-:1 \@P3 LDG.E.CI$dtype y2x3, [track3];\n" . + "--:-:-:Y:8 R2P PR, pred_bits, 0x0f;\n" . + "20:-:-:-:1 \@P6 IADD track00.CC, track00, -param_XNp;\n" . + "--:-:-:-:1 SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;\n", + + j3c23 => "--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, P4;\n", + + + j3c25 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y1x0, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track01, track01, -RZ;\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track10, -param_XNp;\n", + + j3c26 => "--:-:-:-:1 \@P0 LDG.E.CI$dtype y1x0, [track0];\n", + + j3c30 => "--:-:-:-:1 \@!P1 I2I.U32.U32 y1x1, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track11, track11, -RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track20, -param_XNp;\n", + + j3c31 => "--:-:-:-:1 \@P1 LDG.E.CI$dtype y1x1, [track1];\n", + + j3c33 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n", + + j3c35 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y1x2, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track21, track21, -RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track30, -param_XNp;\n", + + j3c36 => "--:-:-:-:1 \@P2 LDG.E.CI$dtype y1x2, [track2];\n", + + j3c40 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y1x3, RZ;\n" . + "--:-:-:-:1 \@P6 IADD.X track31, track31, -RZ;\n", + + j3c42 => "--:6:4:-:1 \@P3 LDG.E.CI$dtype y1x3, [track3];\n" . + "--:-:-:Y:8 R2P PR, pred_bits, 0x0f;\n" . + "20:-:-:-:1 \@P6 IADD track00.CC, track00, param_2XNp;\n" . + "--:-:-:-:1 SHF.L.U64 pred_bits, pred_bits, 12, pred_bits;\n", + + j3c46 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y3x0, RZ;\n" . 
+        "--:-:-:-:1 \@P6 IADD.X track01, track01, RZ;\n" .
+        "--:-:-:-:1 \@P6 IADD track10.CC, track10, param_2XNp;\n",
+
+        j3c47 => "--:-:-:-:1 \@P0 LDG.E.CI$dtype y3x0, [track0];\n",
+
+        j3c51 => "--:-:-:-:1 \@!P1 I2I.U32.U32 y3x1, RZ;\n" .
+        "--:-:-:-:1 \@P6 IADD.X track11, track11, RZ;\n" .
+        "--:-:-:-:1 \@P6 IADD track20.CC, track20, param_2XNp;\n",
+
+        j3c52 => "--:-:-:-:1 \@P1 LDG.E.CI$dtype y3x1, [track1];\n",
+
+        j3c56 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y3x2, RZ;\n" .
+        "--:-:-:-:1 \@P6 IADD.X track21, track21, RZ;\n" .
+        "--:-:-:-:1 \@P6 IADD track30.CC, track30, param_2XNp;\n",
+
+        j3c57 => "--:-:-:-:1 \@P2 LDG.E.CI$dtype y3x2, [track2];\n",
+
+        j3c60 => "--:-:-:-:2 \@!P3 I2I.U32.U32 y3x3, RZ;\n" .
+        "--:-:-:-:1 \@P6 IADD.X track31, track31, RZ;\n",
+
+        j3c62 => "--:6:5:-:1 \@P3 LDG.E.CI$dtype y3x3, [track3];\n",
+
+        j3c63 => "--:-:-:Y:5 \@P4 BRA.U IMAGE_LOOP;\n",
+    )
+    );
+    my @cOrder; # [x, y] index pairs in FFMA issue order; holds 64 entries once the loops below finish (4 x seeds * 4 y seeds * 4 offsets)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]); # four (dx, dy) offsets applied to every (x, y) seed pair
+    my @y = (0,1,4,5); # y seeds; the list is reversed after each x seed
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # walk the y seeds in the opposite direction for the next x seed
+    }
+    my $out; # generated text; this is the return value of the block
+    foreach my $j (0 .. 3) # four passes, each emitting 64 FFMA lines
+    {
+        my $odd = $j & 1; # 0 or 1: picks the j0*/j1* operand names read by this pass's FFMAs
+        my $nOdd = !$odd + 0; # the other value (1 or 0): names the registers the LDS lines below write
+        my $rsOffset = ($j + 1) % 4; # multiplier in the LDS address expression: pass number + 1, wrapping 3 -> 0
+        my $bankOffset = $IX ? 0 : 8; # added to 512 in the readIs addresses only; the readEs lines always pass 8
+
+        my ($c0, $c2, $c4, $c6) = $j == 3 && !$IX ? (4,6,8,10) : (0,2,4,6); # FFMA slots that get the four LDS lines; presumably moved to 4,6,8,10 in pass 3 when !$IX so the j3c0..j3c3 table entries are not overwritten
+
+        $insert{"j${j}c$c0"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy0, [readIs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c$c2"} = sprintf "--:-:-:-:1 LDS.U.128 j%dEx0, [readEs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, 8;
+        $insert{"j${j}c$c4"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy4, [readIs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c$c6"} = sprintf "--:-:1:-:1 LDS.U.128 j%dEx4, [readEs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins = $insert{"j${j}c$c"} || ''; # text scheduled after FFMA number $c of pass $j, or nothing
+
+            my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1; # 0 when the first line of $ins contains one of these opcode names, otherwise 1
+
+            my $yield = $j < 3 && $stall && ($c % 3 == 0) ? 'Y' : '-'; # 'Y' only in passes 0-2, on every third FFMA whose stall value is 1
+
+            my $wait = $c == 0 ? $j == 2 && !$IX ? '03' : '01' : '--'; # first FFMA of a pass gets '01' ('03' in pass 2 when !$IX); every other FFMA gets '--'
+
+            my $ctrl = "$wait:-:-:$yield:$stall"; # same five colon-separated fields as the prefix on the literal instruction lines
+
+            $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; # FFMA with cx<x>y<y> as both first and last operand, followed by the scheduled text
+        }
+    }
+    return $out;
++]
+[+
+    our $IX;
+    return $IX ? q{
+// Advance x offset/preds
+
+--:-:-:-:1 IADD gxs, gxs, param_strideX;
+--:-:-:-:1 IADD offset, offset, param_loopXI;
+
+01:-:-:-:1 BFE.U32 super_x, tid, param_superXI;
+--:-:-:-:1 SHL gx, gxs, param_shiftXI;
+
+--:-:-:-:1 BFE.U32 n, tid, param_superNI;
+
+--:-:-:Y:d ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:0 IADD gx, gx, super_x;
+--:-:-:Y:5 @P5 BRA.U IMAGE_LOOP;
+
+// Advance y offset/preds
+--:-:-:-:1 IADD gys, gys, param_strideY;
+--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6;
+--:-:-:-:1 LDS gxs, [addr_blk_Q];
+--:-:-:-:1 BFE.U32 super_x, tid, param_superXI;
+--:-:-:-:1 PSETP.AND.AND P5, PT, PT, PT, PT;
+--:-:-:-:0 BFE.U32 super_y, tid, param_superYI;
+--:-:1:-:2 LDS blkC, [addr_blk_C];
+--:-:-:-:1 ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+
+01:-:-:-:1 SHL gx, gxs, param_shiftXI;
+--:-:-:-:1 SHL gy, gys, param_shiftYI;
+--:-:-:-:1 IADD gx, gx, super_x;
+--:-:-:-:1 IADD gy, gy, super_y;
+--:-:-:-:1 XMAD.U16.U16 offset, gx, param_N, n;
+--:-:-:-:1 XMAD.U16.U16.LO2C offset, gy, param_XN, offset;
+--:-:-:-:1 XMAD.U16.U16.LO2C offset, blkC, param_YXN, offset;
+
+--:-:-:Y:5 @P6 BRA.U IMAGE_LOOP;
+
+// Set n to loop remaining times
+--:-:-:-:1 LOP.AND.NZ P5, init, pred_bits, 3;
+--:-:-:-:1 MOV nloop, param_loopN;
+--:-:-:-:1 MOV N, param_N;
+--:-:-:Y:a LOP.AND pred_bits, pred_bits, ~3;
+--:-:-:-:0 VMAD.U16.U16 n, -init, nloop, N;
+--:-:-:Y:5 @P5 BRA.U IMAGE_LOOP;
+--:-:-:Y:5 BRA.U END_LOOP;
+    } : q{
+// Advance x offset/preds
+
+--:-:-:-:1 IADD gxs, gxs,
param_strideX; +--:-:-:-:1 IADD offset, offset, param_loopX; +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, P6; +--:-:-:-:1 SHL x, gxs, param_shiftX; +01:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 n, tid, param_superN; +--:-:-:-:1 ISCADD x, super_x, x, 1; +--:-:-:-:1 IADD x, x, -param_pad_x; +--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 IADD x2, x, 2; +--:-:-:-:1 IADD x3, x, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_X, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_X, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_X, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_X, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x0f; +// Extract y + init + buffer bits +--:-:-:-:1 BFE.U32 mask_y, pred_bits, 0x710; +--:-:-:-:1 R2P PR, mask_y, 0x0f; +--:-:-:-:1 SEL pred_bits, mask_x, RZ, P0; +--:-:-:-:1 @P1 BFI pred_bits, mask_x, 0x404, pred_bits; +--:-:-:-:1 @P2 BFI pred_bits, mask_x, 0x408, pred_bits; +--:-:-:-:1 @P3 BFI pred_bits, mask_x, 0x40c, pred_bits; +--:-:-:-:0 BFI pred_bits, mask_y, 0x710, pred_bits; + + +--:-:-:Y:5 @P5 BRA.U IMAGE_LOOP; + +// Advance y offset/preds +--:-:-:-:1 IADD gys, gys, param_strideY; +--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6; +--:-:-:-:1 LDS gxs, [addr_blk_Q]; +--:-:-:-:0 BFE.U32 init, pred_bits, 0x314; +--:-:1:-:1 LDS blkC, [addr_blk_C]; +--:-:-:-:3 PSETP.AND.AND P5, PT, PT, PT, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, gys, param_GYS, PT; +--:-:-:-:5 CAL IMAGE_OFFSET; +--:-:-:-:0 BFI pred_bits, init, 0x314, pred_bits; +--:-:-:Y:5 @P6 BRA.U IMAGE_LOOP; + + +// Set n to loop remaining times +--:-:-:-:1 SHR.U32 pred_bits, init, 2; +--:-:-:-:1 MOV nloop, param_loopN; +--:-:-:-:1 MOV N, param_N; +--:-:-:Y:c LOP.AND.NZ P5, init, init, 3; +--:-:-:-:1 SHL pred_bits, pred_bits, 22; +--:-:-:-:0 VMAD.U16.U16 n, -init, nloop, N; +--:-:-:Y:5 @P5 BRA.U IMAGE_LOOP; +--:-:-:Y:5 
BRA.U END_LOOP; + + }; ++] + + +ERROR_LOOP: + +[+ + our ($dtype, $convert_in, $dtype_shift, $IX); + my %insert = ( + + $convert_in ? ( + j1c13 => "02:-:2:-:1 $convert_in p0q0, p0q0;\n", + j1c17 => "04:-:3:-:1 $convert_in p0q1, p0q1;\n", + j1c21 => "08:-:4:-:1 $convert_in p1q1, p1q1;\n", + j1c25 => "10:-:5:-:1 $convert_in p1q0, p1q0;\n", + ) : (), + + j1c23 => "02:-:-:-:1 FMUL e0, p0q0, 0.5;\n", + + j1c28 => "04:-:-:-:1 FFMA E01, p0q1, 0.5, e0;\n" . + "--:-:-:-:1 FFMA E02, p0q1, -0.5, e0;\n", + + j1c29 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*00 + 32>], E00;\n", + j1c31 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*01 + 32>], E01;\n", + j1c33 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*02 + 32>], E02;\n", + j1c35 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*03 + 32>], E03;\n", + + j1c37 => "08:-:-:-:1 FMUL e1, p1q1, 0.5;\n", + + j1c42 => "10:-:-:-:1 FFMA E13, p1q0, 0.5, e1;\n" . + "--:-:-:-:1 FFMA E14, p1q0, 0.5, -e1;\n", + + j1c43 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*12 + 32>], E12;\n", + j1c45 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*15 + 32>], E15;\n", + j1c47 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*13 + 32>], E13;\n", + j1c49 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*14 + 32>], E14;\n", + + j1c51 => "--:-:-:-:1 FFMA B0, p1q0, 0.5, e0;\n" . + "--:-:-:-:1 FFMA C0, p1q0, -0.5, e0;\n" . + "--:-:-:-:1 FFMA B1, p0q1, 0.5, e1;\n" . + "--:-:-:-:1 FFMA C1, p0q1, 0.5, -e1;\n", + + j2c9 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*04 + 32>], E04;\n", + j2c11 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*08 + 32>], E08;\n", + j2c13 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*07 + 32>], E07;\n", + j2c15 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*11 + 32>], E11;\n", + + j2c17 => "--:-:-:-:1 FMUL e2, B0, 0.5;\n" . + "--:-:-:-:1 FMUL e3, C0, 0.5;\n", + + j2c21 => "--:-:-:-:1 FFMA E05, B1, 0.5, e2;\n" . + "--:-:-:-:1 FFMA E06, B1, -0.5, e2;\n" . + "--:-:-:-:1 FFMA E09, C1, 0.5, e3;\n" . 
+ "--:-:-:-:1 FFMA E10, C1, -0.5, e3;\n", + + j2c23 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*05 + 32>], E05;\n", + j2c25 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*06 + 32>], E06;\n", + j2c27 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*09 + 32>], E09;\n", + j2c29 => "--:-:-:-:1 STS [writeS + 4x<512*4 + 32*10 + 32>], E10;\n", + + j2c32 => "--:-:-:-:1 R2P PR, pred_bits, 0x0f;\n" . + "--:-:-:-:1 \@P6 LEA track00.CC, offset, param_E[0], $dtype_shift;\n", + + j2c37 => "--:-:-:-:1 \@P6 IADD.X track01, RZ, param_E[1];\n" . + "--:-:-:-:1 \@P6 IADD track10.CC, track00, param_Np;\n", + + j2c42 => "--:-:-:-:1 \@P6 IADD.X track11, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track20.CC, track00, param_QNp;\n", + + j2c44 => "--:-:-:-:1 LOP.AND.NZ P4, RZ, pred_bits, 0x400;\n" . + "--:-:-:-:1 LOP.XOR pred_bits, pred_bits, 0x400;\n", + + j2c47 => "--:-:-:-:1 \@P6 IADD.X track21, track01, RZ;\n" . + "--:-:-:-:1 \@P6 IADD track30.CC, track10, param_QNp;\n", + + j2c52 => "--:-:-:-:1 \@P6 IADD.X track31, track11, RZ;\n", + + j2c61 => "--:-:-:-:1 \@P4 MOV swapBuf, 4x<(512*4 + 32)*2>;\n" . + "--:-:-:-:1 \@!P4 MOV swapBuf, -4x<(512*4 + 32)*2>;\n", + + j2c62 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD readEs, readEs, -swapBuf;\n" . + "--:-:-:-:1 IADD writeS, writeS, swapBuf;\n", + + j3c8 => "--:-:2:-:1 \@P0 LDG.E.CI$dtype p0q0, [track0];\n", + j3c10 => "--:-:3:-:1 \@P1 LDG.E.CI$dtype p0q1, [track1];\n", + j3c12 => "--:-:4:-:1 \@P3 LDG.E.CI$dtype p1q1, [track3];\n", + j3c14 => "--:6:5:-:1 \@P2 LDG.E.CI$dtype p1q0, [track2];\n", + + j3c15 => "--:-:-:-:1 PSETP.OR.AND P4, PT, P5, P6, PT;\n" . + "--:-:-:-:1 IADD n, n, param_loopN;\n" . 
+ "--:-:-:-:1 IADD offset, offset, param_loopN;\n", + + j3c16 => "--:-:-:-:1 \@!P0 I2I.U32.U32 p0q0, RZ;\n", + j3c20 => "--:-:-:-:1 \@!P1 I2I.U32.U32 p0q1, RZ;\n", + j3c24 => "--:-:-:-:1 \@!P2 I2I.U32.U32 p1q0, RZ;\n", + j3c28 => "--:-:-:-:1 \@!P3 I2I.U32.U32 p1q1, RZ;\n", + + j3c25 => "--:-:-:-:1 ISETP.LT.AND P4, PT, n, param_N, P4;\n", + + + j3c38 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n", + + + j3c63 => "--:-:-:Y:5 \@P4 BRA.U ERROR_LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 3) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 4; + my $bankOffset = $IX ? 0 : 8; + + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy0, [readIs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, $bankOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 LDS.U.128 j%dEx0, [readEs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, 8; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 LDS.U.128 j%dIy4, [readIs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, $bankOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 LDS.U.128 j%dEx4, [readEs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, 8; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1; + + my $yield = $j < 3 && $stall && ($c % 3 == 0) ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +// Advance x offset/preds + +--:-:-:-:1 IADD gxs, gxs, param_strideX; +--:-:-:-:1 IADD offset, offset, param_loopX; +// Extract y + init + buffer bits +--:-:-:-:1 BFE.U32 mask_y, pred_bits, 0x704; +--:-:-:-:1 R2P PR, mask_y, 0x0c; +--:-:-:-:1 ISETP.LT.AND P5, PT, gxs, param_GXS, P6; +--:-:-:-:1 SHL x, gxs, param_shiftX; +01:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 ISCADD x, super_x, x, 1; +--:-:-:-:1 BFE.U32 n, tid, param_superN; +--:-:-:-:1 IADD x1, x, 1; +--:-:-:-:1 ISETP.LT.AND P0, PT, x, param_Q, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_Q, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, x, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x03; +--:-:-:-:1 SEL pred_bits, mask_x, RZ, P2; +--:-:-:-:1 @P3 BFI pred_bits, mask_x, 0x202, pred_bits; +--:-:-:-:0 BFI pred_bits, mask_y, 0x704, pred_bits; + + +--:-:-:Y:5 @P5 BRA.U ERROR_LOOP; + +// Advance y offset/preds +--:-:-:-:1 IADD gys, gys, param_strideY; +--:-:-:-:0 ISETP.LT.AND P4, PT, n, param_N, P6; +--:-:-:-:1 LDS gxs, [addr_blk_Q]; +--:-:-:-:0 BFE.U32 init, pred_bits, 0x308; +--:-:1:-:1 LDS blkK, [addr_blk_K]; +--:-:-:-:2 PSETP.AND.AND P5, PT, PT, PT, PT; +--:-:-:-:0 ISETP.LT.AND P6, PT, gys, param_GYS, PT; +--:-:-:-:5 CAL ERROR_OFFSET; +--:-:-:-:0 BFI pred_bits, init, 0x308, pred_bits; +--:-:-:Y:5 @P6 BRA.U ERROR_LOOP; + +// Set n to loop remaining times +--:-:-:-:1 SHR.U32 pred_bits, init, 2; +--:-:-:-:1 MOV nloop, param_loopN; +--:-:-:-:1 MOV N, param_N; +--:-:-:Y:c LOP.AND.NZ P5, init, init, 3; +--:-:-:-:1 SHL pred_bits, pred_bits, 10; +--:-:-:-:0 VMAD.U16.U16 n, -init, nloop, N; +--:-:-:Y:5 @P5 BRA.U ERROR_LOOP; + +END_LOOP: + +// K_blk, C_blk, P_blk, Q_blk +--:-:1:-:1 LDS.U.128 blkKCPQ, [addr_blk_K]; + + +--:-:-:-:1 MOV alpha, param_alpha; + +// Strip double 
buffering offsets, and the batch dimension on readIs +// This gives us the shared memory write mapping for the thread's registers: +// readEs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readIs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid_16, tid, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; + +--:-:-:-:1 LOP.AND tid_1, tid, 1; +--:-:-:-:1 LOP.AND readIs, tid, 8; +--:-:-:-:1 SHR.U32 readIs, readIs, 2; +--:-:-:-:1 LOP.OR readIs, readIs, tid_1; +--:-:-:-:1 SHL readIs, readIs, 4; + +--:-:-:-:1 BFE.U32 readEs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readEs, readEs, tid_16; +--:-:-:-:1 SHL readEs, readEs, 4; + +// writeCs = readIs * 512 + readEs; +--:-:-:-:1 ISCADD writeCs, readIs, readEs, 9; + +// readCs = tid//32 * 512 + tid & 31 +--:-:-:-:1 LOP.AND tid_31, tid, 31; +--:-:-:-:1 SHR.U32 tid_32, tid, 5; +--:-:-:-:1 ISCADD readCs, tid_32, tid_31, 9; +--:-:-:-:1 SHL readCs, readCs, 2; + +// kk = K_blk*32 + tid&31 +01:-:-:-:1 ISCADD kk, K_blk, tid_31, 5; + +// cc = C_blk*32 + tid//32 +--:-:-:-:1 ISCADD cc, C_blk, tid_32, 5; + +// F00 = c*RSK + r*SK + s*K + k +--:-:-:-:1 XMAD.LO2C trackF, cc, param_RSK, kk; + +[+ + our $determ; + if ($determ) + { + return q{ +--:-:-:-:1 MOV CRSK, param_CRSK; +01:-:-:-:1 XMAD PQ_blk, P_blk, param_strideX, Q_blk; +--:-:-:-:1 XMAD.LO trackF, PQ_blk, CRSK, trackF, xmad_determ; + }; + } + return ''; ++] + +--:-:-:-:1 LEA F00_0.CC, trackF, param_F[0], 2; +--:-:-:-:1 LEA.HI.X F00_1, trackF, param_F[1], RZ, 2; + +--:-:-:-:1 MOV K1, param_K; +--:-:-:-:1 SHL K1, K1, 2; + +--:-:-:-:1 MOV SK1, param_SK; +--:-:-:-:1 SHL SK1, SK1, 2; + +--:-:-:-:1 MOV RSK8, param_RSK; +--:-:-:-:1 SHL RSK8, RSK8, 5; + +--:-:-:-:1 ISETP.LT.AND P0, PT, kk, param_K, PT; + + +--:-:-:-:6 IADD F01_0.CC, F00_0, K1; +--:-:-:-:1 IADD.X F01_1, F00_1, RZ; +--:-:-:-:6 IADD F02_0.CC, F01_0, K1; +--:-:-:-:1 IADD.X F02_1, F01_1, RZ; + +--:-:-:-:6 IADD F10_0.CC, F00_0, SK1; +--:-:-:-:1 IADD.X F10_1, F00_1, RZ; +--:-:-:-:6 IADD F11_0.CC, F01_0, SK1; +--:-:-:-:1 
IADD.X F11_1, F01_1, RZ; +--:-:-:-:6 IADD F12_0.CC, F02_0, SK1; +--:-:-:-:1 IADD.X F12_1, F02_1, RZ; + +--:-:-:-:6 IADD F20_0.CC, F10_0, SK1; +--:-:-:-:1 IADD.X F20_1, F10_1, RZ; +--:-:-:-:6 IADD F21_0.CC, F11_0, SK1; +--:-:-:-:1 IADD.X F21_1, F11_1, RZ; +--:-:-:-:6 IADD F22_0.CC, F12_0, SK1; +--:-:-:-:1 IADD.X F22_1, F12_1, RZ; + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*512 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*512 + 16>], shuffle_x4y2; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; 
+--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*512 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*512 + 16>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:0 LOP.XOR readCs, readCs, 4x<8*512>; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, cx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*512 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 
STS.128 [writeCs+4x<2*512 + 16>], shuffle_x4y6; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*512 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*512 + 16>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:0 LOP.XOR readCs, readCs, 4x<8*512>; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:0 LOP.XOR readCs, readCs, 4x<8*512>; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + +--:-:-:-:0 ISETP.LT.AND P1, PT, cc, param_C, P0; // cc < C && kk < K +--:-:-:-:1 LDS m00, [readCs + 4x< 0*32>]; +--:-:-:-:1 LDS m10, [readCs + 4x< 4*32>]; +--:-:-:-:1 LDS m01, [readCs + 4x< 1*32>]; +--:-:1:-:1 LDS m11, [readCs + 4x< 5*32>]; + +--:-:-:-:0 IADD cc, cc, 8; +--:-:-:-:1 LDS m21, [readCs + 4x< 9*32>]; +--:-:-:-:1 LDS m02, [readCs + 4x< 2*32>]; +--:-:-:-:1 LDS m12, [readCs + 4x< 6*32>]; +--:-:2:-:1 LDS m22, [readCs + 4x<10*32>]; + +--:-:-:-:1 LDS m31, [readCs + 4x<13*32>]; +--:-:-:-:1 LDS m20, [readCs + 4x< 8*32>]; +--:-:-:-:1 LDS m32, [readCs + 4x<14*32>]; +--:-:3:-:1 LDS m03, [readCs + 4x< 3*32>]; + +--:-:-:-:1 LDS m13, [readCs + 4x< 7*32>]; +--:-:-:-:1 LDS m23, [readCs + 4x<11*32>]; +--:-:-:-:1 LDS m30, [readCs + 4x<12*32>]; +--:-:4:-:1 LDS m33, [readCs + 4x<15*32>]; + +01:-:-:-:1 FADD t00, m00, m10; +--:-:-:-:1 FADD t01, m01, m11; +02:-:-:-:1 FADD t21, m11, m21; +--:-:-:-:1 FADD t02, m02, m12; +--:-:-:-:1 FADD t11, m11, -m21; +--:-:-:-:1 FADD t22, m12, m22; +--:-:-:-:1 FADD t12, m12, -m22; +--:-:-:-:1 FADD t01, t01, m21; +04:-:-:-:1 FADD t21, t21, m31; +--:-:-:-:1 FADD t02, t02, m22; +--:-:-:-:1 FADD t20, m10, m20; +--:-:-:-:1 FADD t22, t22, m32; +--:-:-:-:1 FADD t00, t00, m20; +08:-:-:-:1 FADD t03, m03, m13; +--:-:-:-:1 FADD t10, m10, -m20; +--:-:-:-:1 FADD t23, m13, m23; +--:-:-:-:1 FADD t20, t20, m30; +--:-:-:-:1 FADD t13, m13, -m23; +--:-:-:-:1 FADD f00, t00, t01; +--:-:-:-:1 
FADD t03, t03, m23; +--:-:-:-:1 FADD f02, t01, t02; +--:-:-:-:1 FADD t23, t23, m33; +--:-:-:-:1 FADD f10, t10, t11; +--:-:-:-:1 FADD f12, t11, t12; +--:-:-:-:1 FADD f20, t20, t21; +--:-:-:-:1 FADD f22, t21, t22; +--:-:-:-:1 FADD f00, f00, t02; +--:-:-:-:1 FADD f01, t01, -t02; +--:-:-:-:0 FADD f02, f02, t03; +--:-:-:-:1 @P1 [+ output_op() +] [F00_0], f00; +--:-:-:-:0 FADD f10, f10, t12; +--:-:-:-:1 @P1 [+ output_op() +] [F01_0], f01; +--:-:-:-:0 FADD f11, t11, -t12; +--:1:-:-:1 @P1 [+ output_op() +] [F02_0], f02; +--:-:-:-:0 FADD f12, f12, t13; +--:-:-:-:1 @P1 [+ output_op() +] [F10_0], f10; +--:-:-:-:0 FADD f20, f20, t22; +--:-:-:-:1 @P1 [+ output_op() +] [F11_0], f11; +--:-:-:-:0 FADD f21, t21, -t22; +--:2:-:-:1 @P1 [+ output_op() +] [F12_0], f12; +--:-:-:-:0 FADD f22, f22, t23; +--:-:-:-:1 @P1 [+ output_op() +] [F20_0], f20; +--:-:-:-:1 @P1 [+ output_op() +] [F21_0], f21; +--:3:-:-:1 @P1 [+ output_op() +] [F22_0], f22; + +01:-:-:-:6 IADD F00_0.CC, F00_0, RSK8; +--:-:-:-:1 IADD.X F00_1, F00_1, RZ; +--:-:-:-:6 IADD F01_0.CC, F01_0, RSK8; +--:-:-:-:1 IADD.X F01_1, F01_1, RZ; +--:-:-:-:6 IADD F02_0.CC, F02_0, RSK8; +--:-:-:-:1 IADD.X F02_1, F02_1, RZ; +02:-:-:-:6 IADD F10_0.CC, F10_0, RSK8; +--:-:-:-:1 IADD.X F10_1, F10_1, RZ; +--:-:-:-:6 IADD F11_0.CC, F11_0, RSK8; +--:-:-:-:1 IADD.X F11_1, F11_1, RZ; +--:-:-:-:6 IADD F12_0.CC, F12_0, RSK8; +--:-:-:-:1 IADD.X F12_1, F12_1, RZ; +04:-:-:-:6 IADD F20_0.CC, F20_0, RSK8; +--:-:-:-:1 IADD.X F20_1, F20_1, RZ; +--:-:-:-:6 IADD F21_0.CC, F21_0, RSK8; +--:-:-:-:1 IADD.X F21_1, F21_1, RZ; +--:-:-:-:6 IADD F22_0.CC, F22_0, RSK8; +--:-:-:-:0 IADD.X F22_1, F22_1, RZ; + +--:-:-:-:5 RET; + diff --git a/Kernel/Convolution/Pascal/xconv_winograd_3x3_4x4_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_3x3_4x4_32x32.sass new file mode 100644 index 0000000..20e8a9d --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_winograd_3x3_4x4_32x32.sass @@ -0,0 +1,1047 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our ($type, $D); # NOTE(review): never assigned in this block, presumably set by whoever renders the template (confirm) +our $determ = $D; # copy of $D, picks the op string returned by output_op() +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; # non-empty only for $type 'h', later blocks test it to choose the F2F conversion path +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; # non-empty only for $type 'h' +our $dtype_shift = $type eq 'h' ? '1' : '2'; # shift amount passed to the LEA ops that build track0/track1 +our $dtype_size = $type eq 'h' ? '2' : '4'; # multiplier inside the LDG address offsets +our $vec_size = $type eq 'h' ? '64' : '128'; # width suffix appended to LDG.E. / LDS.U. +sub dtype_shift { return $dtype_shift; } # accessor used from inline template expressions +sub vec_size { return $vec_size; } # accessor used from inline template expressions +sub output_op { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; } # op string chosen by $determ: STG.E.CG when true, RED.E.ADD otherwise +-] + + + + addr_zero : 4x<32*36*2*4 + 64 + 0> + addr_rYXN : 4x<32*36*2*4 + 64 + 4> + addr_iYXN : 4x<32*36*2*4 + 64 + 5> + addr_idx_K : 4x<32*36*2*4 + 64 + 6> + addr_idx_C : 4x<32*36*2*4 + 64 + 7> + + param_F[0] : c[0x0][0x140] + param_F[1] : c[0x0][0x144] + param_I[0] : c[0x0][0x148] + param_I[1] : c[0x0][0x14c] + param_E[0] : c[0x0][0x150] + param_E[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_K : c[0x0][0x15c] + param_C : c[0x0][0x160] + param_k : c[0x0][0x164] + param_c : c[0x0][0x168] + param_kc : c[0x0][0x16c] + param_magic_kc : c[0x0][0x170] + param_shift_kc : c[0x0][0x174] + param_magic_c : c[0x0][0x178] + param_shift_c : c[0x0][0x17c] + param_YXN2 : c[0x0][0x180] + param_sYXN : c[0x0][0x184] + param_magic_sYXN : c[0x0][0x188] + param_shift_sYXN : c[0x0][0x18c] + param_stride_YXNp : c[0x0][0x190] + param_YXN : c[0x0][0x194] + param_YXN_1152 : c[0x0][0x198] + param_RSK : c[0x0][0x19c] + param_CRSK : c[0x0][0x1a0] + param_Kp : c[0x0][0x1a4] + param_SKp : c[0x0][0x1a8] +
param_RSK15_SK2p : c[0x0][0x1ac] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10 : clx<0-3>y0 + 7, 6,15,14 : clx<0-3>y1 + 1, 0, 9, 8 : clx<0-3>y2 + 5, 4,13,12 : clx<0-3>y3 + 19,18,27,26 : clx<0-3>y4 + 23,22,31,30 : clx<0-3>y5 + 17,16,25,24 : clx<0-3>y6 + 21,20,29,28 : clx<0-3>y7 + + 32-43 : jl0Fx<0-3>, jl0Iy<0-7> + 44-51 : jl1Fx<0-3>, jl1Iy<4-7> + 36-39 : jl1Iy<0-3> + + 52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3> + 88-89 : track<0-1> + 90-91 ~ writeS + + 32-86 ~ idx_YXNkc, idx_K, idx_C, idx_YXN, div<1-3>, magic_kc, neg_kc, idx_kc, idx_k, idx_c, YXN2_idx, neg_sYXN, magic_sYXN, remainder, yxn, offset, offset2, tid32_2, tid1, tid31 + 87 = tid + + 32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1 + 48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16 + + + 3, 2,11,10,19,18,27,26 : ccx<0-7>y0 + 7, 6,15,14,23,22,31,30 : ccx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2 + 5, 4,13,12,21,20,29,28 : ccx<0-7>y3 + 35,34,43,42,51,50,59,58 : ccx<0-7>y4 + 39,38,47,46,55,54,63,62 : ccx<0-7>y5 + 33,32,41,40,49,48,57,56 : ccx<0-7>y6 + 37,36,45,44,53,52,61,60 : ccx<0-7>y7 + + 64-79 : jc0Fx<0-7>, jc0Iy<0-7> + 80-91 : jc1Fx<4-7>, jc1Iy<0-7> + 64-67 : jc1Fx<0-3> + + 64-86 ~ tid16, tid_1, tid128 + + 92-95 ~ reduce_YXN, swapBuf, readFs, readIs + + + 64-89 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxC, idxK, idxI, readFs2, readIs2, offsetF, k, CRSK, xmad_determ + 86-89 : Out1<0-1>, Out2<0-1> + 90-91 : Out0<0-1> + 92-95 ~ alpha, writeCs, readCs, c + + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + + 84-85 ~ t<0-1> + + 3, 2,11,19,10,18 : m<0-5>0 + 1, 9, 0, 8,17,16 : m<0-5>1 + 27,26,25,24,64,65 : m<0-5>2 + 66,67,68,69,70,71 : m<0-5>3 + 72,73,74,75,76,77 : m<0-5>4 + 78,79,80,81,82,83 : m<0-5>5 + + 3, 2,11 : w00, w10, w20 + 1, 9, 0 : w01, w11, w21 + 27,26,25 : w02, w12, w22 + 66,67,68 : w03, w13, w23 + 72,73,74 : w04, w14, w24 + 78,79,80 : w05, w15, w25 + + 19,10,18,69,70,71 ~ s00, s10, s20 + 8,17,16,75,76,77 ~ s02, s12, s22 + 24,64,65,81,82,83 ~ s01, 
s11, s21 + + + +--:-:-:-:0 MOV swapBuf, 4x<32*36*2*2>; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:-:-:1 STS.128 [addr_zero], RZ; +01:-:-:Y:d ISETP.GE.AND P0, PT, tid, 128, PT; +--:-:-:-:5 @P0 BRA.U COMPUTE_SETUP; + +############################################################## +LOAD_SETUP: + +--:-:1:-:1 S2R idx_YXNkc, SR_CTAID.X; +--:-:2:-:1 S2R idx_K, SR_CTAID.Z; +--:-:3:-:1 S2R idx_C, SR_CTAID.Y; + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +] + +--:-:-:-:1 ISETP.EQ.AND P0, PT, tid, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, tid, 64, PT; + +// idx_YXN = idx_YXNkc / blk_kc +--:-:-:-:1 MOV magic_kc, param_magic_kc; +--:-:-:-:1 IADD neg_kc, RZ, -param_kc; +--:-:-:-:1 ISETP.NE.AND P2, PT, magic_kc, 1, PT; +01:-:-:-:1 @P2 XMAD div1, idx_YXNkc, magic_kc, RZ; +--:-:-:-:1 @P2 XMAD div2, idx_YXNkc, magic_kc.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, idx_YXNkc.H1, magic_kc.H1, RZ; +--:-:-:-:1 @P2 XMAD.CHI div1, idx_YXNkc.H1, magic_kc, div1; +--:-:-:-:1 @P2 IADD3.RS idx_YXN, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 idx_YXN, idx_YXN, param_shift_kc; +--:-:-:-:1 @!P2 SHR.U32 idx_YXN, idx_YXNkc, param_shift_kc; + +// idx_kc = idx_YXNkc % blk_kc +--:-:-:-:1 XMAD.LO2 idx_kc, neg_kc, idx_YXN, idx_YXNkc; + +// idx_k = idx_kc / blk_c +// idx_c = idx_kc % blk_c +--:-:-:-:1 XMAD idx_k, idx_kc, param_magic_c, RZ; +--:-:-:-:1 SHR.U32 idx_k, idx_k, param_shift_c; +--:-:-:-:1 XMAD idx_c, idx_k, param_c, RZ; +--:-:-:-:1 IADD idx_c, -idx_c, idx_kc; + +// idx_K = idx_K * blk_k + idx_k +// idx_C = idx_C * blk_c + idx_c +02:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; +04:-:-:-:1 XMAD idx_C, idx_C, param_c, idx_c; + +// reduce_YXN = ceil((YXN2 - idx_YXN) / sYXN) +--:-:-:-:1 IADD YXN2_idx, -idx_YXN, param_YXN2; +--:-:-:-:1 IADD neg_sYXN, RZ, -param_sYXN; +--:-:-:-:1 MOV magic_sYXN, param_magic_sYXN; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_sYXN, 1, PT; +--:-:-:-:1 @P3 XMAD div1, YXN2_idx, magic_sYXN, RZ; +--:-:-:-:1 @P3 XMAD div2, YXN2_idx, magic_sYXN.H1, RZ; 
+--:-:-:-:1 @P3 XMAD div3, YXN2_idx.H1, magic_sYXN.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, YXN2_idx.H1, magic_sYXN, div1; +--:-:-:-:1 @P3 IADD3.RS reduce_YXN, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 reduce_YXN, reduce_YXN, param_shift_sYXN; +--:-:-:-:1 @!P3 SHR.U32 reduce_YXN, YXN2_idx, param_shift_sYXN; + +--:-:-:-:1 XMAD.LO2 remainder, neg_sYXN, reduce_YXN, YXN2_idx; +--:-:-:-:1 IMNMX.U32 remainder, remainder, 1, PT; +--:-:-:-:1 IADD reduce_YXN, reduce_YXN, remainder; + +--:-:-:-:1 @P0 STS [addr_iYXN], idx_YXN; +--:-:-:-:1 @P0 STS [addr_idx_K], idx_K; +--:-:-:-:1 @P0 STS [addr_idx_C], idx_C; +--:6:-:-:1 @P0 STS [addr_rYXN], reduce_YXN; + +// yxn = (tid & 63) >> 5 +--:-:-:-:1 BFE.U32 yxn, tid, 0x105; // 1 bit at position 5 + +// offset2 = (idx_YXN + (reduce_YXN - 1)*sYXN)*2 + yxn +--:-:-:-:1 IADD offset, reduce_YXN, -1; +--:-:-:-:1 XMAD offset2, offset, param_sYXN, idx_YXN; +--:-:-:-:1 XMAD.PSL offset2, offset.H1, param_sYXN, offset2; +--:-:-:-:1 ISCADD offset2, offset2, yxn, 1; + +// P6 = offset2 < YXN +--:-:-:-:1 ISETP.LT.AND P6, PT, offset2, param_YXN, PT; + +// P5 = reduce_YXN > 1 +--:-:-:-:1 ISETP.GT.AND P5, PT, reduce_YXN, 1, PT; + +--:-:-:-:1 LOP.AND tid32_2, tid, -32; +--:-:-:-:1 SHR.U32 tid32_2, tid32_2, 2; + +// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7) +--:-:-:-:1 BFE.U32 readFs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readFs, readFs, tid32_2; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*36*2>, 4; + +// readIs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readIs, tid, 16; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 IADD3 readIs, readIs, tid1, tid32_2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// writeS = (yxn*32*36 + (tid & 31)*4)*4 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL writeS, tid31, 4; +--:-:-:-:1 XMAD writeS, yxn, 4x<32*36>, writeS; + +// offset = offset2*32*36 + tid31*4 +--:-:-:-:1 SHL tid31, tid31, 2; +--:-:-:-:0 XMAD.LO2 offset, offset2, 1x<32*36>, tid31; +
+ +--:-:-:-:6 @P1 BRA.U FILTER_SETUP; + +############################################################## +IMAGE_SETUP: + + +// (GC32,GY,GX,N,6,6,32) +// offset += idx_C * YXN*32*36 +--:-:-:-:1 XMAD.LO2C offset, idx_C, param_YXN_1152, offset; + +--:-:-:-:1 LEA track0.CC, offset, param_I[0], [+ dtype_shift() +]; +--:-:-:-:0 LEA.HI.X track1, offset, param_I[1], RZ, [+ dtype_shift() +]; + + +--:-:-:-:6 BRA.U LOAD; + +############################################################## +FILTER_SETUP: + + +// writeS += 32*36*2*4 +--:-:-:-:1 IADD writeS, writeS, 4x<32*36*2>; + +// (GK32,GY,GX,N,6,6,32) +// offset += idx_K * YXN*32*36 +--:-:-:-:1 XMAD.LO2C offset, idx_K, param_YXN_1152, offset; + +--:-:-:-:1 LEA track0.CC, offset, param_E[0], [+ dtype_shift() +]; +--:-:-:-:2 LEA.HI.X track1, offset, param_E[1], RZ, [+ dtype_shift() +]; + + +############################################################## +LOAD: + +20:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>]; +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>]; +--:-:2:-:1 @P6 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T0, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T1, [addr_zero]; +--:-:2:-:1 @!P6 LDS.U.[+ vec_size() +] T2, [addr_zero]; + +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>]; +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>]; +--:-:3:-:1 @P6 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T3, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T4, [addr_zero]; +--:-:3:-:1 @!P6 LDS.U.[+ vec_size() +] T5, [addr_zero]; + +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>]; +--:-:-:-:1 @P6 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>]; +--:-:4:-:1 @P6 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T6, 
[addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T7, [addr_zero]; +--:-:4:-:1 @!P6 LDS.U.[+ vec_size() +] T8, [addr_zero]; + +[+ + our $convert_in; + return $convert_in ? q{ + +02:-:-:-:1 F2F.F32.F16 T03, T01.H1; +--:-:-:-:1 F2F.F32.F16 T02, T01.H0; +--:-:-:-:1 F2F.F32.F16 T01, T00.H1; +--:-:2:-:1 F2F.F32.F16 T00, T00.H0; + +--:-:-:-:1 F2F.F32.F16 T13, T11.H1; +--:-:-:-:1 F2F.F32.F16 T12, T11.H0; +--:-:-:-:1 F2F.F32.F16 T11, T10.H1; +--:-:5:-:1 F2F.F32.F16 T10, T10.H0; + +--:-:-:-:1 F2F.F32.F16 T23, T21.H1; +--:-:-:-:1 F2F.F32.F16 T22, T21.H0; +--:-:-:-:1 F2F.F32.F16 T21, T20.H1; +--:-:6:-:1 F2F.F32.F16 T20, T20.H0; + +02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0; + +04:-:-:-:1 F2F.F32.F16 T33, T31.H1; +--:-:-:-:1 F2F.F32.F16 T32, T31.H0; +--:-:-:-:1 F2F.F32.F16 T31, T30.H1; +--:-:3:-:1 F2F.F32.F16 T30, T30.H0; + +10:-:-:-:1 STS.128 [writeS + 4x<1*32*4>], T1; + +--:-:-:-:1 F2F.F32.F16 T43, T41.H1; +--:-:-:-:1 F2F.F32.F16 T42, T41.H0; +--:-:-:-:1 F2F.F32.F16 T41, T40.H1; +--:-:5:-:1 F2F.F32.F16 T40, T40.H0; + +20:-:-:-:1 STS.128 [writeS + 4x<2*32*4>], T2; + +--:-:-:-:1 F2F.F32.F16 T53, T51.H1; +--:-:-:-:1 F2F.F32.F16 T52, T51.H0; +--:-:-:-:1 F2F.F32.F16 T51, T50.H1; +--:-:6:-:1 F2F.F32.F16 T50, T50.H0; + +04:-:-:-:1 STS.128 [writeS + 4x<3*32*4>], T3; + +08:-:-:-:1 F2F.F32.F16 T63, T61.H1; +--:-:-:-:1 F2F.F32.F16 T62, T61.H0; +--:-:-:-:1 F2F.F32.F16 T61, T60.H1; +--:-:4:-:1 F2F.F32.F16 T60, T60.H0; + +10:-:-:-:1 STS.128 [writeS + 4x<4*32*4>], T4; + +--:-:-:-:1 F2F.F32.F16 T73, T71.H1; +--:-:-:-:1 F2F.F32.F16 T72, T71.H0; +--:-:-:-:1 F2F.F32.F16 T71, T70.H1; +--:-:5:-:1 F2F.F32.F16 T70, T70.H0; + +20:-:-:-:1 STS.128 [writeS + 4x<5*32*4>], T5; + +--:-:-:-:1 F2F.F32.F16 T83, T81.H1; +--:-:-:-:1 F2F.F32.F16 T82, T81.H0; +--:-:-:-:1 F2F.F32.F16 T81, T80.H1; +--:-:6:-:1 F2F.F32.F16 T80, T80.H0; + +08:-:-:-:1 STS.128 [writeS + 4x<6*32*4>], T6; +10:-:-:-:1 STS.128 [writeS + 4x<7*32*4>], T7; +20:-:-:-:1 STS.128 [writeS + 4x<8*32*4>], T8; + + } : q{ +02:-:-:-:1 STS.128 
[writeS + 4x<0*32*4>], T0; +--:-:-:-:1 STS.128 [writeS + 4x<1*32*4>], T1; +--:-:-:-:1 STS.128 [writeS + 4x<2*32*4>], T2; +04:-:-:-:1 STS.128 [writeS + 4x<3*32*4>], T3; +--:-:-:-:1 STS.128 [writeS + 4x<4*32*4>], T4; +--:-:-:-:1 STS.128 [writeS + 4x<5*32*4>], T5; +08:-:-:-:1 STS.128 [writeS + 4x<6*32*4>], T6; +--:-:-:-:1 STS.128 [writeS + 4x<7*32*4>], T7; +--:-:-:-:1 STS.128 [writeS + 4x<8*32*4>], T8; + }; ++] + +--:-:-:-:0 IADD track0.CC, track0, -param_stride_YXNp; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:1 LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>]; +--:-:1:-:1 LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>]; + +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>]; +--:-:2:-:1 @P5 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>]; +--:-:3:-:1 @P5 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>]; +--:-:-:-:1 @P5 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>]; +--:6:4:-:1 @P5 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>]; + +--:-:-:-:5 BRA.U LOAD_LOOP; + +############################################################## + +COMPUTE_SETUP: + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +--:-:-:-:1 IADD tid128, tid, -128; + +// readFs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readIs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid128, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readFs, tid128, 0x201; // 2 bits at 
position 1 +--:-:-:-:1 LOP.OR readFs, readFs, tid16; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*4 + 32*36*2>, 4; + +--:-:-:-:1 LOP.AND tid_1, tid128, 1; +--:-:-:-:1 LOP.AND readIs, tid128, 8; +--:-:-:-:1 SHR.U32 readIs, readIs, 2; +--:-:-:-:1 IADD3 readIs, readIs, tid16, tid_1; +--:-:-:-:0 ISCADD readIs, readIs, 4x<32*4>, 4; + + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS reduce_YXN, [addr_rYXN]; + +--:-:-:-:1 LDS.U.128 jc0Iy0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Fx0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Iy4, [readIs + 4x<0*32*36 + 16>]; +--:-:1:-:2 LDS.U.128 jc0Fx4, [readFs + 4x<0*32*36 + 16>]; + +COMPUTE_LOOP: +[+ + my %insert = ( + + j0c33 => "--:-:-:-:1 ISETP.GT.AND P0, PT, reduce_YXN, 1, PT;\n" . + "--:-:-:-:1 IADD reduce_YXN, reduce_YXN, -1;\n", + + j0c62 => "02:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readIs, readIs, swapBuf;\n" . + "--:-:-:-:1 IADD readFs, readFs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j1c63 => "--:-:-:Y:5 \@P0 BRA.U COMPUTE_LOOP;\n" . + "--:-:-:Y:5 BRA.U COMPUTE_FINISH;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 1) + { + my $odd = $j; + my $nOdd = 1 - $j; + my $rsPred = $j == 1 ? '@P0' : ' '; + my $bar = $j == 0 ? '2' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIy4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFx4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIy0, [readIs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd; + + $insert{"j${j}c31"} = sprintf "--:%s:1:-:1 %s LDS.U.128 jc%dFx0, [readFs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd; + + + foreach my $c (0 .. 
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $yield = $c % 10 == 0 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA ccx%dy%d, jc%dFx%d, jc%dIy%d, ccx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +LOAD_LOOP: +--:-:-:-:1 ISETP.GT.AND P0, PT, reduce_YXN, 1, PT; +20:-:-:-:1 IADD track0.CC, track0, -param_stride_YXNp; +--:-:-:-:1 ISETP.GT.AND P1, PT, reduce_YXN, 2, PT; +--:-:-:-:1 IADD reduce_YXN, reduce_YXN, -1; +[+ + our ($vec_size, $dtype_size, $convert_in); + my %insert = ( + + j0c3 => "--:-:-:-:1 IADD.X track1, track1, -RZ;\n", + + j0c0 => "--:-:-:-:1 LDS.U.128 jl1Iy4, [readIs + 4x<1*32*36 + 16>];\n", + j0c2 => "--:-:-:-:1 LDS.U.128 jl1Fx0, [readFs + 4x<1*32*36 + 00>];\n", + j0c18 => "--:-:1:-:1 LDS.U.128 jl1Iy0, [readIs + 4x<1*32*36 + 00>];\n", + + j1c12 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>];\n", + j1c14 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>];\n", + j1c16 => "--:-:1:-:1 \@P0 LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>];\n", + + $convert_in ? ( + + j0c1 => "02:-:-:-:1 F2F.F32.F16 T03, T01.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T02, T01.H0;\n", + j0c4 => "--:-:-:-:1 F2F.F32.F16 T01, T00.H1;\n" . + "--:-:2:-:1 F2F.F32.F16 T00, T00.H0;\n", + + j0c5 => "--:-:-:-:1 F2F.F32.F16 T13, T11.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T12, T11.H0;\n", + j0c6 => "--:-:-:-:1 F2F.F32.F16 T11, T10.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T10, T10.H0;\n", + + j0c7 => "--:-:-:-:1 F2F.F32.F16 T23, T21.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T22, T21.H0;\n", + j0c8 => "--:-:-:-:1 F2F.F32.F16 T21, T20.H1;\n" . 
+ "--:-:6:-:1 F2F.F32.F16 T20, T20.H0;\n", + + j0c9 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n", + j0c10 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n", + j0c11 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n", + + j0c13 => "02:-:-:-:1 \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n", + j0c14 => "10:-:-:-:1 \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n", + j0c15 => "20:-:2:-:1 \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n", + + j0c16 => "04:-:-:-:1 F2F.F32.F16 T33, T31.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T32, T31.H0;\n", + j0c17 => "--:-:-:-:1 F2F.F32.F16 T31, T30.H1;\n" . + "--:-:3:-:1 F2F.F32.F16 T30, T30.H0;\n", + + j0c19 => "--:-:-:-:1 F2F.F32.F16 T43, T41.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T42, T41.H0;\n", + j0c20 => "--:-:-:-:1 F2F.F32.F16 T41, T40.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T40, T40.H0;\n", + + j0c21 => "--:-:-:-:1 F2F.F32.F16 T53, T51.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T52, T51.H0;\n", + j0c22 => "--:-:-:-:1 F2F.F32.F16 T51, T50.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 T50, T50.H0;\n", + + j0c23 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n", + j0c24 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n", + j0c25 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n", + + j0c27 => "04:-:-:-:1 \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n", + j0c28 => "10:-:-:-:1 \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n", + j0c29 => "20:-:3:-:1 \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n", + + j0c30 => "08:-:-:-:1 F2F.F32.F16 T63, T61.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T62, T61.H0;\n", + j0c31 => "--:-:-:-:1 F2F.F32.F16 T61, T60.H1;\n" . + "--:-:4:-:1 F2F.F32.F16 T60, T60.H0;\n", + + j1c0 => "--:-:-:-:1 F2F.F32.F16 T73, T71.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T72, T71.H0;\n", + j1c1 => "--:-:-:-:1 F2F.F32.F16 T71, T70.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T70, T70.H0;\n", + + j1c2 => "--:-:-:-:1 F2F.F32.F16 T83, T81.H1;\n" . 
+ "--:-:-:-:1 F2F.F32.F16 T82, T81.H0;\n", + j1c3 => "--:-:-:-:1 F2F.F32.F16 T81, T80.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 T80, T80.H0;\n", + + j1c4 => "08:4:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n", + j1c5 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n", + j1c6 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n", + + j1c8 => "08:-:-:-:1 \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n", + j1c9 => "10:-:-:-:1 \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n", + j1c10 => "20:6:4:-:1 \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n", + + ) : ( + + j0c6 => "02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0;\n", + j0c8 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n", + j0c10 => "--:2:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n", + + j0c12 => "02:-:-:-:1 \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n", + j0c14 => "--:-:-:-:1 \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n", + j0c16 => "--:-:2:-:1 \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n", + + j0c20 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n", + j0c22 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n", + j0c24 => "--:3:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n", + + j0c26 => "04:-:-:-:1 \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n", + j0c28 => "--:-:-:-:1 \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n", + j0c30 => "--:-:3:-:1 \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n", + + j1c0 => "08:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n", + j1c2 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n", + j1c4 => "--:4:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n", + + j1c6 => "08:-:-:-:1 \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n", + j1c8 => "--:-:-:-:1 \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n", + j1c10 => "--:6:4:-:1 \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n", + ), + + j1c11 
=> "--:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j1c31 => "--:-:-:Y:5 \@P0 BRA.U LOAD_LOOP;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "01" : '--'; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dFx%d, jl%dIy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + +--:-:1:-:2 S2R Tid, SR_TID.X; + +--:-:-:-:1 MOV alpha16, param_alpha; + +01:-:-:-:1 LOP.AND Tid32_2, Tid, -32; +--:-:-:-:1 SHR.U32 Tid32_2, Tid32_2, 2; + +// readIs = ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND Tid1, Tid, 1; +01:-:-:-:1 LOP.AND readIs, Tid, 16; +--:-:-:-:1 SHR.U32 readIs, readIs, 3; +--:-:-:-:1 IADD readIs, readIs, Tid1; + +// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readIs << 2) +--:-:-:-:1 BFE.U32 readFs, Tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readFs, readFs, Tid32_2; +--:-:-:-:1 ISCADD readFs, readIs, readFs, 2; + +--:-:-:-:1 SHL readFs, readFs, 4; +--:-:-:-:1 SHL readIs, readIs, 3; + +// writeCs = readIs * 32*36 + readFs; +--:-:-:-:1 XMAD write16Cs, readIs, 1x<32*36>, readFs; + + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y2, 
alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y2, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y2, alpha16; +--:-:-:-:4 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y3, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y3, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y6, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y6, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y7, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y7, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 
BAR.SYNC 0; + +--:-:-:-:5 EXIT; + +COMPUTE_FINISH: + +--:-:1:-:2 S2R tid_128, SR_TID.X; + + +--:-:-:-:1 MOV alpha, param_alpha; + +01:-:-:-:1 IADD tid_128, tid_128, -128; + +--:-:-:-:1 ISETP.GE.AND P4, PT, tid_128, 256, PT; + +// readIs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND Tid_1, tid_128, 1; +--:-:-:-:1 LOP.AND readIs2, tid_128, 8; +--:-:-:-:1 SHR.U32 readIs2, readIs2, 2; +--:-:-:-:1 IADD readIs2, readIs2, Tid_1; + +// readFs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readIs2 << 2) +--:-:-:-:1 LOP.AND tid_16, tid_128, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; +--:-:-:-:1 BFE.U32 readFs2, tid_128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readFs2, readFs2, tid_16; +--:-:-:-:1 ISCADD readFs2, readIs2, readFs2, 2; + +--:-:-:-:1 ISCADD readFs2, readFs2, 4x<32*4>, 4; +--:-:-:-:1 SHL readIs2, readIs2, 3; + +// writeCs = readIs2 * 32*36 + readFs2; +--:-:-:-:0 XMAD writeCs, readIs2, 1x<32*36>, readFs2; + + +--:-:-:-:5 @P4 BRA.U SKIP0; + +--:-:1:-:1 LDS idxK, [addr_idx_K]; +--:-:2:-:1 LDS idxC, [addr_idx_C]; +[+ our $determ; return $determ ? q{--:-:3:-:1 LDS idxI, [addr_iYXN];} : ''; +] + + + +--:-:-:-:1 LOP.AND tid_31, tid_128, 31; +--:-:-:-:1 SHR.U32 tid_32, tid_128, 5; +--:-:-:-:1 SHR.U32 tid_64, tid_128, 6; + +// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16 +--:-:-:-:1 XMAD readCs, tid_32, 1x<32*36>, tid_31; +--:-:-:-:1 ISCADD readCs, tid_64, readCs, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// k = K_blk*32 + tid_31 +// c = C_blk*32 + tid_32<<1 +--:-:-:-:1 SHL tid_32, tid_32, 1; +01:-:-:-:1 ISCADD k, idxK, tid_31, 5; +02:-:-:-:1 ISCADD c, idxC, tid_32, 5; + + +// offsetF = c*RSK + r*SK + s*K + k +--:-:-:-:1 XMAD.LO2C offsetF, c, param_RSK, k; + +[+ + our $determ; + return $determ ?
q{ +--:-:-:-:1 MOV CRSK, param_CRSK; +04:-:-:-:1 XMAD.LO offsetF, idxI, CRSK, offsetF, xmad_determ; + } : ''; ++] + +--:-:-:-:1 LEA Out00.CC, offsetF, param_F[0], 2; +--:-:-:-:1 LEA.HI.X Out01, offsetF, param_F[1], RZ, 2; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, k, param_K, PT; + + +SKIP0: + + +--:-:-:-:1 IADD Out10.CC, Out00, param_Kp; +--:-:-:-:1 IADD.X Out11, Out01, RZ; +--:-:-:-:1 IADD Out20.CC, Out10, param_Kp; +--:-:-:-:1 IADD.X Out21, Out11, RZ; + +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, ccx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, ccx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, ccx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP1; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +01:-:-:-:5 IADD Out00.CC, Out00, param_SKp; +--:-:-:-:1 IADD c, c, 1; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_SKp; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_SKp; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + +SKIP1: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y1, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y1, alpha; +--:-:-:-:1 FMUL 
shuffle_x3y0, ccx3y1, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP2; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +01:-:-:-:5 IADD Out00.CC, Out00, param_RSK15_SK2p; +--:-:-:-:1 IADD c, c, 15; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_RSK15_SK2p; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_RSK15_SK2p; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + +SKIP2: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y4, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y6, alpha; +--:-:-:-:1 FMUL 
shuffle_x5y1, ccx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y6, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP3; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +01:-:-:-:5 IADD Out00.CC, Out00, param_SKp; +--:-:-:-:1 IADD c, c, 1; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_SKp; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_SKp; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + +SKIP3: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y5, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP4; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP4: + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + +--:-:-:-:0 ISETP.LT.AND P1, PT, c, param_C, P0; + +[+ + my $out; + foreach my $i (0 .. 5) + { + foreach my $j (0 .. 5) + { + my $b = $j == 5 ? 
$i + 1 : '-'; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + +[+ + my $out; + foreach my $i (0 .. 5) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ +$w:-:-:-:1 FADD t0, m1$i, m2$i; +$w:-:-:-:1 FADD t1, m3$i, m4$i; +--:-:-:-:1 FADD m1$i, m1$i, -m2$i; +--:-:-:-:1 FADD m3$i, m3$i, -m4$i; +--:-:-:-:1 FADD w0$i, m0$i, t0; +--:-:-:-:1 FADD w0$i, w0$i, t1; +--:-:-:-:1 FMUL w1$i, m1$i, 0.625; +--:-:-:-:1 FFMA w1$i, m3$i, 1.5, w1$i; +--:-:-:-:1 FFMA w2$i, t1, 2.25, m5$i; +--:-:-:-:1 FFMA w2$i, t0, 0.390625, w2$i; + }; + } + return $out; ++] + + + +[+ + my $out; + foreach my $i (0 .. 2) + { + $out .= qq{ +--:-:-:-:1 FADD t0, w${i}1, w${i}2; +--:-:-:-:1 FADD t1, w${i}3, w${i}4; +--:-:-:-:1 FADD w${i}1, w${i}1, -w${i}2; +--:-:-:-:1 FADD w${i}3, w${i}3, -w${i}4; +--:-:-:-:1 FADD s${i}0, w${i}0, t0; +--:-:-:-:1 FADD s${i}0, s${i}0, t1; +--:-:-:-:1 FMUL s${i}1, w${i}1, 0.625; +--:-:-:-:1 FFMA s${i}1, w${i}3, 1.5, s${i}1; +--:-:-:-:1 FFMA s${i}2, t1, 2.25, w${i}5; +--:-:-:-:1 FFMA s${i}2, t0, 0.390625, s${i}2; + }; + } + return $out; ++] + +//--:-:1:-:1 I2F.F32.S32 temp, c; + + +--:1:-:-:1 @P1 [+ output_op() +] [Out0], s00; +--:2:-:-:1 @P1 [+ output_op() +] [Out1], s01; +--:3:-:-:1 @P1 [+ output_op() +] [Out2], s02; +01:-:-:-:6 IADD Out00.CC, Out00, param_SKp; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_SKp; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_SKp; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + + + +--:1:-:-:1 @P1 [+ output_op() +] [Out0], s10; +--:2:-:-:1 @P1 [+ output_op() +] [Out1], s11; +--:3:-:-:1 @P1 [+ output_op() +] [Out2], s12; +01:-:-:-:6 IADD Out00.CC, Out00, param_SKp; +--:-:-:-:1 IADD.X Out01, Out01, RZ; +02:-:-:-:6 IADD Out10.CC, Out10, param_SKp; +--:-:-:-:1 IADD.X Out11, Out11, RZ; +04:-:-:-:6 IADD Out20.CC, Out20, param_SKp; +--:-:-:-:1 IADD.X Out21, Out21, RZ; + + + +--:1:-:-:1 @P1 [+ output_op() +] [Out0], s20; +--:2:-:-:1 @P1 [+ output_op() 
+] [Out1], s21; +--:3:-:-:1 @P1 [+ output_op() +] [Out2], s22; + + + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32.sass new file mode 100644 index 0000000..d4b2941 --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32.sass @@ -0,0 +1,1237 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $type; +our $dtype = $type eq 'h' ? 'U16' : '32'; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : ''; +our $dshift = $type eq 'h' ? '1' : '2'; +our $dsize = $type eq 'h' ? '2' : '4'; +our $vsize = $type eq 'h' ? 
'64' : '128'; +sub dtype { return $dtype; } +sub dsize { return $dsize; } +sub dshift { return $dshift; } +sub vsize { return $vsize; } +-] + + + + addr_zero : 4x<32*36*2*4 + 64 + 0> + addr_idx_Y : 4x<32*36*2*4 + 64 + 4> + addr_idx_X : 4x<32*36*2*4 + 64 + 5> + addr_idx_K : 4x<32*36*2*4 + 64 + 6> + + param_S[0] : c[0x0][0x140] + param_S[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_C : c[0x0][0x174] + param_K : c[0x0][0x178] + param_N : c[0x0][0x17c] + param_Y : c[0x0][0x180] + param_W : c[0x0][0x184] + param_YXN : c[0x0][0x188] + param_XN : c[0x0][0x18c] + param_Y2 : c[0x0][0x190] + param_GX : c[0x0][0x194] + param_Xk : c[0x0][0x198] + param_k : c[0x0][0x19c] + param_magic_Xk : c[0x0][0x1a0] + param_shift_Xk : c[0x0][0x1a4] + param_magic_k : c[0x0][0x1a8] + param_shift_k : c[0x0][0x1ac] + param_P : c[0x0][0x1b0] + param_Q : c[0x0][0x1b4] + param_QN : c[0x0][0x1b8] + param_PQN : c[0x0][0x1bc] + param_PQN15 : c[0x0][0x1c0] + param_maskN : c[0x0][0x1c4] + param_shiftX : c[0x0][0x1c8] + param_shiftY : c[0x0][0x1cc] + param_superX : c[0x0][0x1d0] + param_superY : c[0x0][0x1d4] + param_pad_x : c[0x0][0x1d8] + param_pad_y : c[0x0][0x1dc] + param_RSK : c[0x0][0x1e0] + param_RSK2p : c[0x0][0x1e4] + param_YXN2p : c[0x0][0x1e8] + param_gridN : c[0x0][0x1ec] + param_gridQN : c[0x0][0x1f0] + param_gridPQN : c[0x0][0x1f4] + + + + + 0-63 : czero<00-63> + + 3, 2,11,10 : clx<0-3>y0 + 7, 6,15,14 : clx<0-3>y1 + 1, 0, 9, 8 : clx<0-3>y2 + 5, 4,13,12 : clx<0-3>y3 + 19,18,27,26 : clx<0-3>y4 + 23,22,31,30 : clx<0-3>y5 + 17,16,25,24 : clx<0-3>y6 + 21,20,29,28 : clx<0-3>y7 + + 32-43 : jl0Ix<0-3>, jl0Fy<0-7> + 44-51 : jl1Ix<0-3>, jl1Fy<4-7> + 36-39 : jl1Fy<0-3> + + 32-43 ~ swapBuff + + 88-89 : 
track<0-1> + 90-92 : writeS, pred30, pred36 + + // Image Transform + 44-51 ~ ti<0-5> + + 52,53,54,56,57,55 : i<0-5>0 + 59,60,61,63,58,62 : i<0-5>1 + 66,67,68,64,65,69 : i<0-5>2 + 73,74,75,71,72,70 : i<0-5>3 + 87,82,83,85,86,84 : i<0-5>4 + 80,81,76,78,79,77 : i<0-5>5 + + 52,53,54,56,57,55 : TI<0-5>0 + 59,60,61,63,58,62 : TI<0-5>1 + 66,67,68,64,65,69 : TI<0-5>2 + 73,74,75,71,72,70 : TI<0-5>3 + 87,82,83,85,86,84 : TI<0-5>4 + 80,81,76,78,79,77 : TI<0-5>5 + + 52,53,54,56,57,55 : I<0-5>0 + 59,60,61,63,58,62 : I<0-5>1 + 66,67,68,64,65,69 : I<0-5>2 + 73,74,75,71,72,70 : I<0-5>3 + 87,82,83,85,86,84 : I<0-5>4 + 80,81,76,78,79,77 : I<0-5>5 + + // Filter Transform + 44-47 ~ rcp6, rcp8, rcp12, rcp24 + + 52,53,54 : f<0-2>0 + 55,56,57 : f<0-2>1 + 58,59,60 : f<0-2>2 + + 61,62,63 : tf<0-2>0 + 64,65,66 : tf<0-2>1 + 67,68,69 : tf<0-2>2 + + 70,71,72,73,74,54 : TF<0-5>0 + 76,77,78,79,80,57 : TF<0-5>1 + 82,83,84,85,86,60 : TF<0-5>2 + + 61,64,48,49,50,51 : ff<0-5>0 + 52,53,55,56,58,59 : ff<0-5>1 + 61,64,48,49,50,51 : ff<0-5>2 + + 70,71,72,73,74,54 : F<0-5>0 + 62,63,65,66,67,68 : F<0-5>1 + 52,53,55,56,58,59 : F<0-5>2 + 69,75,81,87,76,77 : F<0-5>3 + 61,64,78,79,80,57 : F<0-5>4 + 82,83,84,85,86,60 : F<0-5>5 + + 32-39 ~ partialC, idx_K, idx_Y, idx_X + 40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, idx_Y2, idx_X2, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, gx, gy, c, kk, offset, sign, idx_N, nn, x<1-5>, mask_x, super_x, super_y, partC + + 32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1 + 48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16 + + + 3, 2,11,10,19,18,27,26 : ccx<0-7>y0 + 7, 6,15,14,23,22,31,30 : ccx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2 + 5, 4,13,12,21,20,29,28 : ccx<0-7>y3 + 35,34,43,42,51,50,59,58 : ccx<0-7>y4 + 39,38,47,46,55,54,63,62 : ccx<0-7>y5 + 33,32,41,40,49,48,57,56 : ccx<0-7>y6 + 37,36,45,44,53,52,61,60 : ccx<0-7>y7 + + 64-79 : jc0Ix<0-7>, jc0Fy<0-7> + 80-91 : jc1Ix<4-7>, jc1Fy<0-7> + 64-67 : jc1Ix<0-3> + + 64-86 ~ tid16, tid_1, tid128 + 92 = swapBuf 
+ + 87 = tid + 93-95 ~ C, readFs, readIs + + 64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q + 86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset + + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + + // t00 80 r00 78 + // t10 m10 r01 w01 + // t20 m20 r02 w02 + // t30 m30 r03 w03 + // w00 m00 s00 w00 + // w30 m40 s01 w01 + // w10 m10 s02 w02 + // w20 m20 s03 w04 + + 78 = t0<0-5>, r<0-3>0 + 79 = temp + + 3, 2,11,10,19,18 : m<0-5>0 + 1, 9, 0, 8,17,16 : m<0-5>1 + 27,26,25,24,64,65 : m<0-5>2 + 2,11,10 : t10, t20, t30 + 9, 0, 8 : t11, t21, t31 + 26,25,24 : t12, t22, t32 + 3, 2,11,19 : w00, w10, w20, w30 + 1, 9, 0,17 : w01, w11, w21, w31 + 27,26,25,64 : w02, w12, w22, w32 + + 66,67,68,69,70,71 : m<0-5>3 + 72,73,74,75,76,77 : m<0-5>4 + 8,24,10,65,16,18 : m<0-5>5 + 67,68,69 : t13, t23, t33 + 73,74,75 : t14, t24, t34 + 24,10,65 : t15, t25, t35 + 66,67,68,70 : w03, w13, w23, w33 + 72,73,74,76 : w04, w14, w24, w34 + 8,24,10,16 : w05, w15, w25, w35 + + 1,27,66 : r01, r02, r03 + 9,26,67 : r11, r12, r13 + 0,25,68 : r21, r22, r23 + 17,64,70 : r31, r32, r33 + 3, 1,27,72 : s00, s01, s02, s03 + 2, 9,26,73 : s10, s11, s12, s13 + 11, 0,25,74 : s20, s21, s22, s23 + 19,17,64,76 : s30, s31, s32, s33 + + 80-83 ~ xx<0-3> + 78-81 ~ sum<0-3> + 82-83 : Sum<0-1> + 84-85 : Out<0-1> + + 8,10,16,18 ~ b0<0-3> + 24,65,66,67 ~ b1<0-3> + 68,69,70,71 ~ b2<0-3> + 75,77,78,79 ~ b3<0-3> + + + +--:-:-:-:0 MOV C, param_C; +--:-:1:-:2 S2R tid, SR_TID.X; +01:-:-:-:0 ISETP.GE.AND P0, PT, tid, 128, PT; +--:-:-:-:1 STS.128 [addr_zero], RZ; +--:-:-:Y:c LOP.AND partialC, C, 1; +--:-:-:-:0 IADD C, C, partialC; +--:-:-:-:5 @P0 BRA.U COMPUTE_SETUP; + +############################################################## +LOAD_SETUP: + +--:-:1:-:1 S2R idx_YXk, SR_CTAID.X; +--:-:2:-:1 S2R idx_K, SR_CTAID.Y; + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +] + +--:-:-:-:1 ISETP.EQ.AND 
P0, PT, tid, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, tid, 64, PT; + +// idx_Y2 = idx_YXk / blk_Xk +--:-:-:-:1 MOV magic_Xk, param_magic_Xk; +--:-:-:-:1 IADD negXk, RZ, -param_Xk; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_Xk, 1, PT; +01:-:-:-:1 @P3 XMAD div1, idx_YXk, magic_Xk, RZ; +--:-:-:-:1 @P3 XMAD div2, idx_YXk, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD div3, idx_YXk.H1, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk, div1; +--:-:-:-:1 @P3 IADD3.RS idx_Y2, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 idx_Y2, idx_Y2, param_shift_Xk; +--:-:-:-:1 @!P3 SHR.U32 idx_Y2, idx_YXk, param_shift_Xk; + +// idx_Xk = idx_YXk % blk_Xk +--:-:-:-:1 XMAD.LO2 idx_Xk, negXk, idx_Y2, idx_YXk; + +// idx_X2 = idx_Xk / blk_k +// idx_k = idx_Xk % blk_k +--:-:-:-:1 XMAD idx_X2, idx_Xk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_X2, idx_X2, param_shift_k; +--:-:-:-:1 XMAD idx_k, idx_X2, param_k, RZ; +--:-:-:-:1 IADD idx_k, -idx_k, idx_Xk; + +// idx_K = idx_K * blk_k + idx_k +02:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +// gx = x2 +// gy = y2 * 2 +--:-:-:-:1 MOV idx_X, idx_X2; +--:-:-:-:1 SHL idx_Y, idx_Y2, 1; + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// if y2 != Y2: +// gy += (gx&1) ^ ((gx&2)>>1) +// gx /= 2 +--:-:-:-:1 ISETP.NE.AND P4, PT, idx_Y2, param_Y2, PT; +--:-:-:-:1 @P4 LOP.AND x1, idx_X, 1; +--:-:-:-:1 @P4 BFE.U32 x2, idx_X, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P4 LOP.XOR x1, x1, x2; +--:-:-:-:1 @P4 IADD idx_Y, idx_Y, x1; +--:-:-:-:1 @P4 SHR.U32 idx_X, idx_X, 1; + +// Scan backwards on odd rows +// if y2 & 1: +// gx = gridX - gx - 1 +--:-:-:-:1 LOP.AND.NZ P5, RZ, idx_Y2, 1; +--:-:-:-:1 @P5 IADD idx_X, -idx_X, param_GX; +--:-:-:-:1 @P5 IADD idx_X, idx_X, -1; + +--:6:-:-:1 @P0 STS [addr_idx_Y], idx_Y; +--:6:-:-:1 @P0 STS [addr_idx_X], idx_X; +--:6:-:-:1 @P0 STS [addr_idx_K], idx_K; + +// x = gx << shiftX +// y = gy << shiftY +--:-:-:-:1 SHL gx, idx_X, param_shiftX; +--:-:-:-:1 SHL gy, idx_Y, 
param_shiftY; + +// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp +--:-:-:-:1 BFE.U32 super_x, tid, param_superX; +--:-:-:-:1 BFE.U32 super_y, tid, param_superY; +--:-:-:-:1 ISCADD gx, super_x, gx, 2; +--:-:-:-:1 ISCADD gy, super_y, gy, 2; + +--:-:-:-:1 LOP.AND tid32_2, tid, -32; +--:-:-:-:1 SHR.U32 tid32_2, tid32_2, 2; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid32_2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD3 readFs, readFs, tid1, tid32_2; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*36*2>, 4; + +// c = (tid & 32) >> 5 +--:-:-:-:1 BFE.U32 c, tid, 0x105; // 1 bit at position 5 + +// writeS = c*32*36 + (tid & 31) +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 XMAD writeS, c, 1152, tid31; +--:-:-:-:1 SHL writeS, writeS, 2; + + + +--:-:-:-:5 @P1 BRA.U FILTER_SETUP; + +############################################################## +IMAGE_SETUP: + +--:-:1:-:1 S2R idx_N, SR_CTAID.Z; + + +--:-:-:-:1 STS [writeS + 4x<32*0>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*1>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*2>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*3>], RZ; + +// n = idx_N*32 + (tid & maskN) +--:-:-:-:1 LOP.AND nn, tid, param_maskN; +01:-:-:-:1 ISCADD nn, idx_N, nn, 5; + +// n < N +--:-:-:-:1 ISETP.LT.AND P6, PT, nn, param_N, PT; + +// Subtract off the padding +--:-:-:-:1 IADD gx, gx, -param_pad_x; +--:-:-:-:1 IADD gy, gy, -param_pad_y; + +// offset = c*YXN + y0*XN + x0*N + n; +--:-:-:-:1 XMAD.S16.U16 offset, gx, param_N, nn; +--:-:-:-:1 XMAD.S16.U16.LO2C offset, gy, param_XN, offset; +--:-:-:-:1 XMAD.S16.U16.LO2C offset, c, param_YXN, offset; +--:-:-:-:1 ISET.LT.AND sign, offset, RZ, PT; + +--:-:-:-:1 LEA track0.CC, offset, param_I[0], [+ dshift()
+]; +--:-:-:-:1 IADD.X track1, sign, param_I[1]; + +--:-:-:-:1 IADD x1, gx, 1; +--:-:-:-:1 IADD x2, gx, 2; +--:-:-:-:1 IADD x3, gx, 3; +--:-:-:-:1 IADD x4, gx, 4; +--:-:-:-:1 IADD x5, gx, 5; + +--:-:-:-:1 ISETP.LT.AND P0, PT, gx, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, x4, param_W, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, x5, param_W, PT; +--:-:-:-:1 ISETP.GE.AND P0, PT, gx, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, x4, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, x5, RZ, P5; +--:-:-:-:1 P2R mask_x, PR, RZ, 0x3f; + +--:-:-:-:1 IADD x1, gy, 1; +--:-:-:-:1 IADD x2, gy, 2; +--:-:-:-:1 IADD x3, gy, 3; +--:-:-:-:1 IADD x4, gy, 4; +--:-:-:-:1 IADD x5, gy, 5; +--:-:-:-:1 ISETP.LT.AND P0, PT, gy, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P4, PT, x4, param_Y, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, x5, param_Y, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, gy, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, x1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, x2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, x3, RZ, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, x4, RZ, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, x5, RZ, P5; + +--:-:-:-:1 SEL pred30, mask_x, RZ, P0; +--:-:-:-:1 @P1 BFI pred30, mask_x, 0x606, pred30; +--:-:-:-:1 @P2 BFI pred30, mask_x, 0x60c, pred30; +--:-:-:-:1 @P3 BFI pred30, mask_x, 0x612, pred30; +--:-:-:-:1 @P4 BFI pred30, mask_x, 0x618, pred30; +--:-:-:-:1 SEL pred36, mask_x, RZ, P5; + +// P6 = c == partialC == 1 +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, partialC, P6; +--:-:-:-:1 XMAD partC, partialC, param_YXN, RZ; +--:-:-:-:1 XMAD.PSL 
partialC, partialC, param_YXN.H1, partC; +--:-:-:-:1 SHL partialC, partialC, [+ dshift() +]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30; +20:-:-:-:1 @!P0 MOV i00, RZ; +--:-:-:-:1 @!P1 MOV i01, RZ; +--:-:-:-:1 @!P2 MOV i02, RZ; +--:-:-:-:1 @!P3 MOV i03, RZ; +--:-:-:-:1 @!P4 MOV i04, RZ; +--:-:-:-:1 @!P5 MOV i05, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i04, [track + [+ dsize() +]x<0*$W*$N + 4*$N>]; +--:-:1:-:1 @P5 LDG.E.CI.[+ dtype() +] i05, [track + [+ dsize() +]x<0*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @!P0 MOV i10, RZ; +--:-:-:-:1 @!P1 MOV i11, RZ; +--:-:-:-:1 @!P2 MOV i12, RZ; +--:-:-:-:1 @!P3 MOV i13, RZ; +--:-:-:-:1 @!P4 MOV i14, RZ; +--:-:-:-:1 @!P5 MOV i15, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i14, [track + [+ dsize() +]x<1*$W*$N + 4*$N>]; +--:-:2:-:1 @P5 LDG.E.CI.[+ dtype() +] i15, [track + [+ dsize() +]x<1*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @!P0 MOV i20, RZ; +--:-:-:-:1 @!P1 MOV i21, RZ; +--:-:-:-:1 @!P2 MOV i22, RZ; +--:-:-:-:1 @!P3 MOV i23, RZ; 
+--:-:-:-:1 @!P4 MOV i24, RZ; +--:-:-:-:1 @!P5 MOV i25, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i24, [track + [+ dsize() +]x<2*$W*$N + 4*$N>]; +--:-:3:-:1 @P5 LDG.E.CI.[+ dtype() +] i25, [track + [+ dsize() +]x<2*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 @!P0 MOV i30, RZ; +--:-:-:-:1 @!P1 MOV i31, RZ; +--:-:-:-:1 @!P2 MOV i32, RZ; +--:-:-:-:1 @!P3 MOV i33, RZ; +--:-:-:-:1 @!P4 MOV i34, RZ; +--:-:-:-:1 @!P5 MOV i35, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i34, [track + [+ dsize() +]x<3*$W*$N + 4*$N>]; +--:-:4:-:1 @P5 LDG.E.CI.[+ dtype() +] i35, [track + [+ dsize() +]x<3*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P6 SHF.L.U64 pred30, pred30, 24, pred30; +--:-:-:-:1 @!P0 MOV i40, RZ; +--:-:-:-:1 @!P1 MOV i41, RZ; +--:-:-:-:1 @!P2 MOV i42, RZ; +--:-:-:-:1 @!P3 MOV i43, RZ; +--:-:-:-:1 @!P4 MOV i44, RZ; +--:-:-:-:1 @!P5 MOV i45, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i40, [track + [+ dsize() +]x<4*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i41, [track + [+ dsize() +]x<4*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i42, [track + [+ dsize() +]x<4*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ 
dtype() +] i43, [track + [+ dsize() +]x<4*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i44, [track + [+ dsize() +]x<4*$W*$N + 4*$N>]; +--:-:5:-:1 @P5 LDG.E.CI.[+ dtype() +] i45, [track + [+ dsize() +]x<4*$W*$N + 5*$N>]; + +--:-:-:-:1 @!P6 R2P PR, pred36, 0x3f; +--:-:-:-:1 @P6 R2P PR, RZ, 0x3f; +--:-:-:-:1 @!P0 MOV i50, RZ; +--:-:-:-:1 @!P1 MOV i51, RZ; +--:-:-:-:1 @!P2 MOV i52, RZ; +--:-:-:-:1 @!P3 MOV i53, RZ; +--:-:-:-:1 @!P4 MOV i54, RZ; +--:-:-:-:1 @!P5 MOV i55, RZ; +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i50, [track + [+ dsize() +]x<5*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i51, [track + [+ dsize() +]x<5*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i52, [track + [+ dsize() +]x<5*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i53, [track + [+ dsize() +]x<5*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i54, [track + [+ dsize() +]x<5*$W*$N + 4*$N>]; +--:-:6:-:1 @P5 LDG.E.CI.[+ dtype() +] i55, [track + [+ dsize() +]x<5*$W*$N + 5*$N>]; + + + +--:-:-:-:5 BAR.SYNC 0; + +3f:-:-:-:5 IADD track0.CC, track0, -partialC; +--:-:-:-:1 IADD writeS, writeS, 4x<32*36*2*2>; +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:5 BRA.U IMAGE_LOOP; + +############################################################## +FILTER_SETUP: + + +// writeS += 32*36*2*4 +--:-:-:-:1 IADD writeS, writeS, 4x<32*36*2>; + +--:-:-:-:1 MOV swapBuf, 4x<32*36*2*2>; + +// P6 = c == partialC == 1 +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, 1, PT; +--:-:-:-:1 ISETP.EQ.AND P6, PT, c, partialC, P6; +--:-:-:-:1 XMAD partC, partialC, param_RSK, RZ; +--:-:-:-:1 XMAD.PSL partialC, partialC, param_RSK.H1, partC; +--:-:-:-:1 SHL partialC, partialC, [+ dshift() +]; + +// k = idx_K*32 + tid & 31 +--:-:-:-:1 ISCADD kk, idx_K, tid31, 5; +--:-:-:-:1 ISETP.LT.AND P6, PT, kk, param_K, !P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, kk, param_K, PT; + +// a0 = c*RSK + k +--:-:-:-:1 XMAD.LO2C offset, c, param_RSK, kk; +--:-:-:-:1 LEA track0.CC, offset, param_F[0], [+ dshift() +]; +--:-:-:-:1 
LEA.HI.X track1, offset, param_F[1], RZ, [+ dshift() +]; + +--:-:-:-:1 STS [writeS + 4x<32*0>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*1>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*2>], RZ; +--:-:-:-:1 STS [writeS + 4x<32*3>], RZ; + +20:-:-:-:1 @!P6 MOV f00, RZ; +--:-:-:-:1 @!P6 MOV f01, RZ; +--:-:-:-:1 @!P6 MOV f02, RZ; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f00, [track + [+ dsize() +]x<0*3*$K + 0*$K>]; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f01, [track + [+ dsize() +]x<0*3*$K + 1*$K>]; +--:-:1:-:1 @P6 LDG.E.CI.[+ dtype() +] f02, [track + [+ dsize() +]x<0*3*$K + 2*$K>]; + +--:-:-:-:1 @!P6 MOV f10, RZ; +--:-:-:-:1 @!P6 MOV f11, RZ; +--:-:-:-:1 @!P6 MOV f12, RZ; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f10, [track + [+ dsize() +]x<1*3*$K + 0*$K>]; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f11, [track + [+ dsize() +]x<1*3*$K + 1*$K>]; +--:-:2:-:1 @P6 LDG.E.CI.[+ dtype() +] f12, [track + [+ dsize() +]x<1*3*$K + 2*$K>]; + +--:-:-:-:1 @!P6 MOV f20, RZ; +--:-:-:-:1 @!P6 MOV f21, RZ; +--:-:-:-:1 @!P6 MOV f22, RZ; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f20, [track + [+ dsize() +]x<2*3*$K + 0*$K>]; +--:-:-:-:1 @P6 LDG.E.CI.[+ dtype() +] f21, [track + [+ dsize() +]x<2*3*$K + 1*$K>]; +--:5:3:-:1 @P6 LDG.E.CI.[+ dtype() +] f22, [track + [+ dsize() +]x<2*3*$K + 2*$K>]; + + + +--:-:-:-:5 BAR.SYNC 0; + +10:-:-:-:4 IADD track0.CC, track0, -partialC; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; +--:-:-:-:0 IADD.X track1, track1, -RZ; + +--:-:-:-:5 BRA.U FILTER_LOOP; + +############################################################## + +COMPUTE_SETUP: + + +--:-:-:-:1 MOV swapBuf, 4x<32*36*2*2>; + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +--:-:-:-:1 IADD tid128, tid, -128; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readFs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid128, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readIs, tid128, 0x201; 
// 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid16; +--:-:-:-:1 ISCADD readIs, readIs, 4x<32*4 + 32*36*2*2>, 4; + +--:-:-:-:1 LOP.AND tid_1, tid128, 1; +--:-:-:-:1 LOP.AND readFs, tid128, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 IADD3 readFs, readFs, tid16, tid_1; +--:-:-:-:0 ISCADD readFs, readFs, 4x<32*4 + 32*36*2*3>, 4; + + +--:-:-:-:5 BAR.SYNC 0; + +// Let Load loop run once to transform initial load and store to shared. +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>]; +--:-:1:-:2 LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>]; + +COMPUTE_LOOP: +[+ + my %insert = ( + + j0c33 => "--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT;\n" . + "--:-:-:-:1 IADD C, C, -2;\n", + + j0c62 => "02:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readFs, readFs, -swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j1c63 => "--:-:-:Y:5 \@P0 BRA.U COMPUTE_LOOP;\n" . + "--:-:-:Y:5 BRA.U COMPUTE_FINISH;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 1) + { + my $odd = $j; + my $nOdd = 1 - $j; + my $rsPred = $j == 1 ? '@P0' : ' '; + my $bar = $j == 0 ? 
'2' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd; + + $insert{"j${j}c31"} = sprintf "--:%s:1:-:1 %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd; + + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + #$stall = '4' if $stall && $c % 2 == 0 && $j == 0 && $c > 16; + + my $yield = $c % 5 == 0 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +IMAGE_LOOP: + +[+ + our $convert_in; return $convert_in ? 
q{ +01:-:-:-:1 F2F.F32.F16 i00, i00; +--:-:-:-:1 F2F.F32.F16 i01, i01; +--:-:-:-:1 F2F.F32.F16 i02, i02; +--:-:-:-:1 F2F.F32.F16 i03, i03; +--:-:-:-:1 F2F.F32.F16 i04, i04; +--:-:1:-:1 F2F.F32.F16 i05, i05; + +02:-:-:-:1 F2F.F32.F16 i10, i10; +--:-:-:-:1 F2F.F32.F16 i11, i11; +--:-:-:-:1 F2F.F32.F16 i12, i12; +--:-:-:-:1 F2F.F32.F16 i13, i13; +--:-:-:-:1 F2F.F32.F16 i14, i14; +--:-:2:-:1 F2F.F32.F16 i15, i15; + +04:-:-:-:1 F2F.F32.F16 i20, i20; +--:-:-:-:1 F2F.F32.F16 i21, i21; +--:-:-:-:1 F2F.F32.F16 i22, i22; +--:-:-:-:1 F2F.F32.F16 i23, i23; +--:-:-:-:1 F2F.F32.F16 i24, i24; +--:-:3:-:1 F2F.F32.F16 i25, i25; + +08:-:-:-:1 F2F.F32.F16 i30, i30; +--:-:-:-:1 F2F.F32.F16 i31, i31; +--:-:-:-:1 F2F.F32.F16 i32, i32; +--:-:-:-:1 F2F.F32.F16 i33, i33; +--:-:-:-:1 F2F.F32.F16 i34, i34; +--:-:4:-:1 F2F.F32.F16 i35, i35; + +10:-:-:-:1 F2F.F32.F16 i40, i40; +--:-:-:-:1 F2F.F32.F16 i41, i41; +--:-:-:-:1 F2F.F32.F16 i42, i42; +--:-:-:-:1 F2F.F32.F16 i43, i43; +--:-:-:-:1 F2F.F32.F16 i44, i44; +--:-:5:-:1 F2F.F32.F16 i45, i45; + +20:-:-:-:1 F2F.F32.F16 i50, i50; +--:-:-:-:1 F2F.F32.F16 i51, i51; +--:-:-:-:1 F2F.F32.F16 i52, i52; +--:-:-:-:1 F2F.F32.F16 i53, i53; +--:-:-:-:1 F2F.F32.F16 i54, i54; +--:-:6:-:2 F2F.F32.F16 i55, i55; + } : ''; ++] + +[+ + my $out; + foreach my $i (0 .. 5) + { + my $w = $i == 0 ? 
'3f' : '--'; + $out .= qq{ +$w:-:-:-:1 FFMA ti4, i2$i, -2.640625, i4$i; +--:-:-:-:1 FFMA ti5, i3$i, -2.640625, i5$i; +--:-:-:-:1 FFMA ti0, i2$i, -2.25, i4$i; +--:-:-:-:1 FFMA ti1, i1$i, -2.25, i3$i; +--:-:-:-:1 FFMA ti2, i2$i, -0.390625, i4$i; +--:-:-:-:1 FFMA ti3, i1$i, -0.390625, i3$i; +--:-:-:-:1 FFMA TI0$i, i0$i, 0.87890625, ti4; +--:-:-:-:1 FFMA TI5$i, i1$i, 0.87890625, ti5; +--:-:-:-:1 FFMA TI1$i, ti1, 0.625, ti0; +--:-:-:-:1 FFMA TI2$i, ti1, -0.625, ti0; +--:-:-:-:1 FFMA TI3$i, ti3, 1.5, ti2; +--:-:-:-:1 FFMA TI4$i, ti3, -1.5, ti2; + }; + } + return $out; ++] + +--:-:-:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; + +--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT; + +--:-:-:-:1 IADD track0.CC, track0, param_YXN2p; +--:-:-:-:1 IADD.X track1, track1, RZ; + +//--:-:-:-:1 LOP32I.AND pred30, pred30, 0xffffff; +--:-:-:-:1 @!P0 BFI pred36, RZ, 0x600, pred36; +--:-:-:-:1 @!P0 MOV pred30, RZ; + +--:-:-:-:1 R2P PR, pred30, 0x3f; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; + +[+ + my $out; + foreach my $i (0 .. 
5) + { + $out .= qq{ +--:-:-:-:1 FFMA ti4, TI${i}2, -2.640625, TI${i}4; +--:-:-:-:1 FFMA ti5, TI${i}3, -2.640625, TI${i}5; +--:-:-:-:1 FFMA ti0, TI${i}2, -2.25, TI${i}4; +--:-:-:-:1 FFMA ti1, TI${i}1, -2.25, TI${i}3; +--:-:-:-:1 FFMA ti2, TI${i}2, -0.390625, TI${i}4; +--:-:-:-:1 FFMA ti3, TI${i}1, -0.390625, TI${i}3; +--:-:-:-:1 FFMA I${i}0, TI${i}0, 0.87890625, ti4; +--:-:-:-:1 FFMA I${i}5, TI${i}1, 0.87890625, ti5; +--:-:-:-:1 FFMA I${i}1, ti1, 0.625, ti0; +--:-:-:-:1 FFMA I${i}2, ti1, -0.625, ti0; +--:-:-:-:1 FFMA I${i}3, ti3, 1.5, ti2; +--:-:-:-:1 FFMA I${i}4, ti3, -1.5, ti2; + }; + } + return $out; ++] + +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 3)>], I03; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 4)>], I04; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 0)>], I00; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 5)>], I05; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 1)>], I01; +--:1:-:-:1 STS [writeS + 4x<32*(0*6 + 2)>], I02; + + +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 0)>], I10; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 5)>], I15; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 3)>], I13; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 1)>], I11; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 2)>], I12; +--:2:-:-:1 STS [writeS + 4x<32*(1*6 + 4)>], I14; + +01:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i04, [track + [+ dsize() +]x<0*$W*$N + 4*$N>]; +--:-:1:-:1 @P5 LDG.E.CI.[+ dtype() +] i05, [track + [+ dsize() +]x<0*$W*$N + 5*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i00, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i01, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i02, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i03, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i04, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i05, RZ; +--:-:-:-:1 R2P PR, pred30, 
0x3f; + +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 0)>], I20; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 5)>], I25; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 1)>], I21; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 2)>], I22; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 3)>], I23; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:3:-:-:1 STS [writeS + 4x<32*(2*6 + 4)>], I24; + +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 0)>], I30; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 5)>], I35; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 1)>], I31; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 2)>], I32; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 3)>], I33; +--:4:-:-:1 STS [writeS + 4x<32*(3*6 + 4)>], I34; + +02:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i14, [track + [+ dsize() +]x<1*$W*$N + 4*$N>]; +--:-:2:-:1 @P5 LDG.E.CI.[+ dtype() +] i15, [track + [+ dsize() +]x<1*$W*$N + 5*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i10, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i11, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i12, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i13, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i14, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i15, RZ; + +--:-:-:-:5 R2P PR, pred30, 0x3f; // FORCE + +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 0)>], I40; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 5)>], I45; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 1)>], I41; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 2)>], I42; + + + + + +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 3)>], I43; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 4)>], I44; +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; + +--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>]; +--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>]; + +04:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() 
+]x<2*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i24, [track + [+ dsize() +]x<2*$W*$N + 4*$N>]; +--:-:3:-:1 @P5 LDG.E.CI.[+ dtype() +] i25, [track + [+ dsize() +]x<2*$W*$N + 5*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i20, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i21, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i22, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i23, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i24, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i25, RZ; +--:-:-:-:6 R2P PR, pred30, 0x3f; // FORCE + +--:-:-:-:1 SHF.R.U64 pred30, pred30, 6, pred30; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 0)>], I50; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 5)>], I55; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 1)>], I51; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 2)>], I52; +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 3)>], I53; +--:6:-:-:1 STS [writeS + 4x<32*(5*6 + 4)>], I54; + +08:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i34, [track + [+ dsize() +]x<3*$W*$N + 4*$N>]; +--:-:4:-:1 @P5 LDG.E.CI.[+ dtype() +] i35, [track + [+ dsize() +]x<3*$W*$N + 5*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i30, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i31, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i32, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i33, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i34, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i35, RZ; +--:-:-:-:c R2P PR, pred30, 0x3f; // FORCE + +--:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i40, [track + [+ dsize() +]x<4*$W*$N + 0*$N>]; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i41, 
[track + [+ dsize() +]x<4*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i42, [track + [+ dsize() +]x<4*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i43, [track + [+ dsize() +]x<4*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i44, [track + [+ dsize() +]x<4*$W*$N + 4*$N>]; +--:-:5:-:1 @P5 LDG.E.CI.[+ dtype() +] i45, [track + [+ dsize() +]x<4*$W*$N + 5*$N>]; +--:-:-:-:1 SHF.L.U64 pred30, pred30, 24, pred30; + +--:-:-:-:1 @!P0 I2I.U32.U32 i40, RZ; +--:-:-:-:1 @!P1 I2I.U32.U32 i41, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i42, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i43, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i44, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i45, RZ; +--:-:-:-:a R2P PR, pred36, 0x3f; // FORCE + +20:-:-:-:1 @P0 LDG.E.CI.[+ dtype() +] i50, [track + [+ dsize() +]x<5*$W*$N + 0*$N>]; +--:-:-:-:1 @!P0 I2I.U32.U32 i50, RZ; +--:-:-:-:1 @P1 LDG.E.CI.[+ dtype() +] i51, [track + [+ dsize() +]x<5*$W*$N + 1*$N>]; +--:-:-:-:1 @P2 LDG.E.CI.[+ dtype() +] i52, [track + [+ dsize() +]x<5*$W*$N + 2*$N>]; +--:-:-:-:1 @P3 LDG.E.CI.[+ dtype() +] i53, [track + [+ dsize() +]x<5*$W*$N + 3*$N>]; +--:-:-:-:1 @P4 LDG.E.CI.[+ dtype() +] i54, [track + [+ dsize() +]x<5*$W*$N + 4*$N>]; +--:-:6:-:1 @P5 LDG.E.CI.[+ dtype() +] i55, [track + [+ dsize() +]x<5*$W*$N + 5*$N>]; +--:-:-:-:1 @!P1 I2I.U32.U32 i51, RZ; +--:-:-:-:1 @!P2 I2I.U32.U32 i52, RZ; +--:-:-:-:1 @!P3 I2I.U32.U32 i53, RZ; +--:-:-:-:1 @!P4 I2I.U32.U32 i54, RZ; +--:-:-:-:1 @!P5 I2I.U32.U32 i55, RZ; + + + +[+ + our ($vsize, $dsize, $convert_in); + my %insert = ( + j0c15 => "--:-:5:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 && $j == 1 ? 
"10" : '--'; + + my $ctrl = "$wait:-:-:-:1"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + +--:-:-:-:1 LOP.AND.Z P0, RZ, pred36, 0x100; +--:-:-:-:1 LOP.XOR pred36, pred36, 0x100; + +--:-:-:-:1 ISETP.GT.AND P1, PT, C, 0, PT; + +--:-:-:-:1 @P0 MOV32I swapBuff, 4x<32*36*2*2>; + + +--:-:-:-:1 @!P0 MOV32I swapBuff, -4x<32*36*2*2>; +--:-:-:-:0 IADD C, C, -2; +--:-:-:Y:5 BAR.SYNC 0; +--:-:-:-:1 IADD readFs, readFs, swapBuff; +--:-:-:-:1 IADD readIs, readIs, swapBuff; +--:-:-:-:1 IADD writeS, writeS, -swapBuff; +--:-:-:Y:5 @P1 BRA.U IMAGE_LOOP; +--:-:-:Y:5 BRA.U LOAD_FINISH; + +FILTER_LOOP: + +[+ + our $convert_in; return $convert_in ? q{ +01:-:-:-:1 F2F.F32.F16 f00, f00; +--:-:-:-:1 F2F.F32.F16 f01, f01; +--:-:1:-:1 F2F.F32.F16 f02, f02; + +02:-:-:-:1 F2F.F32.F16 f10, f10; +--:-:-:-:1 F2F.F32.F16 f11, f11; +--:-:2:-:1 F2F.F32.F16 f12, f12; + +04:-:-:-:1 F2F.F32.F16 f20, f20; +--:-:-:-:1 F2F.F32.F16 f21, f21; +--:-:3:-:1 F2F.F32.F16 f22, f22; + } : ''; ++] + +--:-:-:-:1 MOV32I rcp6, 0.688403361344538; +--:-:-:-:1 MOV32I rcp8, 0.430252100840336; +--:-:-:-:1 MOV32I rcp24, 0.119514472455649; +--:-:-:-:1 MOV32I rcp12, 0.179271708683473; +07:-:-:-:1 FMUL32I tf00, f20, 0.26890756302521; +--:-:-:-:1 FMUL32I tf01, f21, 0.26890756302521; +--:-:-:-:1 FMUL32I tf02, f22, 0.26890756302521; +--:-:-:-:1 FFMA tf10, f00, -rcp6, -tf00; +--:-:-:-:1 FFMA tf20, f00, rcp24, tf00; +--:-:-:-:1 FFMA tf11, f01, -rcp6, -tf01; +--:-:-:-:1 FFMA tf21, f01, rcp24, tf01; +--:-:-:-:1 FFMA tf12, f02, -rcp6, -tf02; +--:-:-:-:1 FFMA tf22, f02, rcp24, tf02; + +--:-:-:-:1 FMUL32I TF00, f00, 1.13777777777778; +--:-:-:-:1 FFMA TF10, f10, -rcp8, tf10; +--:-:-:-:1 FFMA TF20, f10, rcp8, tf10; +--:-:-:-:1 FFMA TF30, f10, rcp12, tf20; +--:-:-:-:1 FFMA TF40, f10, -rcp12, tf20; +//--:-:-:-:1 MOV TF50, f20; + +--:-:-:-:1 FMUL32I TF02, f02, 1.13777777777778; +--:-:-:-:1 FFMA TF12, f12, -rcp8, tf12; +--:-:-:-:1 FFMA TF22, 
f12, rcp8, tf12; +--:-:-:-:1 FFMA TF32, f12, rcp12, tf22; +--:-:-:-:1 FFMA TF42, f12, -rcp12, tf22; +//--:-:-:-:1 MOV TF52, f22; + +--:-:-:-:1 FMUL32I TF01, f01, 1.13777777777778; +--:-:-:-:1 FFMA TF11, f11, -rcp8, tf11; +--:-:-:-:1 FFMA TF21, f11, rcp8, tf11; +--:-:-:-:1 FFMA TF31, f11, rcp12, tf21; +--:-:-:-:1 FFMA TF41, f11, -rcp12, tf21; +//--:-:-:-:1 MOV TF51, f21; + +--:-:-:-:1 FMUL32I ff00, TF02, 0.26890756302521; +--:-:-:-:1 FMUL32I ff10, TF12, 0.26890756302521; +--:-:-:-:1 FMUL32I ff20, TF22, 0.26890756302521; +--:-:-:-:1 FMUL32I ff30, TF32, 0.26890756302521; +--:-:-:-:1 FMUL32I ff40, TF42, 0.26890756302521; +--:-:-:-:1 FMUL32I ff50, TF52, 0.26890756302521; +--:-:-:-:1 FFMA ff01, TF00, -rcp6, -ff00; +--:-:-:-:1 FFMA ff02, TF00, rcp24, ff00; +--:-:-:-:1 FFMA ff11, TF10, -rcp6, -ff10; +--:-:-:-:1 FFMA ff12, TF10, rcp24, ff10; +--:-:-:-:1 FFMA ff21, TF20, -rcp6, -ff20; +--:-:-:-:1 FFMA ff22, TF20, rcp24, ff20; +--:-:-:-:1 FFMA ff31, TF30, -rcp6, -ff30; +--:-:-:-:1 FFMA ff32, TF30, rcp24, ff30; +--:-:-:-:1 FFMA ff41, TF40, -rcp6, -ff40; +--:-:-:-:1 FFMA ff42, TF40, rcp24, ff40; +--:-:-:-:1 FFMA ff51, TF50, -rcp6, -ff50; +--:-:-:-:1 FFMA ff52, TF50, rcp24, ff50; + +--:-:-:-:1 FMUL32I F00, TF00, 1.13777777777778; +--:-:-:-:1 FFMA F01, TF01, -rcp8, ff01; +--:-:-:-:1 FFMA F02, TF01, rcp8, ff01; +--:-:-:-:1 FFMA F03, TF01, rcp12, ff02; +--:-:-:-:1 FFMA F04, TF01, -rcp12, ff02; +//--:-:-:-:1 MOV F05, TF02; + +--:-:-:-:1 FMUL32I F10, TF10, 1.13777777777778; +--:-:-:-:1 FFMA F11, TF11, -rcp8, ff11; +--:-:-:-:1 FFMA F12, TF11, rcp8, ff11; +--:-:-:-:1 FFMA F13, TF11, rcp12, ff12; +--:-:-:-:1 FFMA F14, TF11, -rcp12, ff12; +//--:-:-:-:1 MOV F15, TF12; + +--:-:-:-:1 FMUL32I F20, TF20, 1.13777777777778; +--:-:-:-:1 FFMA F21, TF21, -rcp8, ff21; +--:-:-:-:1 FFMA F22, TF21, rcp8, ff21; +--:-:-:-:1 FFMA F23, TF21, rcp12, ff22; +--:-:-:-:1 FFMA F24, TF21, -rcp12, ff22; +//--:-:-:-:1 MOV F25, TF22; + +--:-:-:-:1 FMUL32I F30, TF30, 1.13777777777778; +--:-:-:-:1 FFMA F31, TF31, 
-rcp8, ff31; +--:-:-:-:1 FFMA F32, TF31, rcp8, ff31; +--:-:-:-:1 FFMA F33, TF31, rcp12, ff32; +--:-:-:-:1 FFMA F34, TF31, -rcp12, ff32; +//--:-:-:-:1 MOV F35, TF32; + +--:-:-:-:1 FMUL32I F40, TF40, 1.13777777777778; +--:-:-:-:1 FFMA F41, TF41, -rcp8, ff41; +--:-:-:-:1 FFMA F42, TF41, rcp8, ff41; +--:-:-:-:1 FFMA F43, TF41, rcp12, ff42; +--:-:-:-:1 FFMA F44, TF41, -rcp12, ff42; +//--:-:-:-:1 MOV F45, TF42; + +--:-:-:-:1 FMUL32I F50, TF50, 1.13777777777778; +--:-:-:-:1 FFMA F51, TF51, -rcp8, ff51; +--:-:-:-:1 FFMA F52, TF51, rcp8, ff51; +--:-:-:-:1 FFMA F53, TF51, rcp12, ff52; +--:-:-:-:1 FFMA F54, TF51, -rcp12, ff52; +//--:-:-:-:1 MOV F55, TF52; + +--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, P2; +--:-:-:-:1 ISETP.GT.AND P1, PT, C, 0, PT; +--:-:-:-:1 IADD C, C, -2; + +--:-:-:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:6:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; + +--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 5)>], F55; + +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 0)>], F00; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 1)>], F01; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 2)>], F02; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 3)>], F03; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 4)>], F04; +--:-:-:-:1 STS [writeS + 4x<32*(0*6 + 5)>], F05; + +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 0)>], F10; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 1)>], F11; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 2)>], F12; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 3)>], F13; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 4)>], F14; +--:-:-:-:1 STS [writeS + 4x<32*(1*6 + 5)>], F15; + +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 0)>], F20; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 1)>], F21; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 2)>], F22; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 3)>], F23; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 4)>], F24; +--:-:-:-:1 STS [writeS + 4x<32*(2*6 + 5)>], F25; + +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 0)>], F30; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 
1)>], F31; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 2)>], F32; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 3)>], F33; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 4)>], F34; +--:-:-:-:1 STS [writeS + 4x<32*(3*6 + 5)>], F35; + +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 0)>], F40; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 1)>], F41; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 2)>], F42; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 3)>], F43; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 4)>], F44; +--:-:-:-:1 STS [writeS + 4x<32*(4*6 + 5)>], F45; + + + +20:-:-:-:1 IADD track0.CC, track0, param_RSK2p; +--:-:-:-:1 IADD.X track1, track1, RZ; + +[+ + our ($dtype, $dsize, $SK, $K); + my %insert = ( + j0c0 => "--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n", + j0c1 => "--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n", + j0c15 => "--:-:5:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + + j0c5 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 0)>], F50;\n", + j0c7 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 1)>], F51;\n", + j0c9 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 2)>], F52;\n", + j0c11 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 3)>], F53;\n", + j0c13 => "--:-:-:-:1 STS [writeS + 4x<32*(5*6 + 4)>], F54;\n", + + j1c1 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n", + j1c2 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n", + j1c3 => "--:-:1:-:1 \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n", + + j1c4 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n", + j1c5 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n", + j1c6 => "--:-:2:-:1 \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n", + + j1c7 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n", + j1c8 => "--:-:-:-:1 \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n", + j1c9 => "--:-:3:-:1 \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 
2*$K>];\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 && $j == 1 ? "10" : '--'; + + my $ctrl = "$wait:-:-:-:1"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + + + +--:-:-:-:1 IADD readFs, readFs, -swapBuf; +--:-:-:-:1 IADD readIs, readIs, -swapBuf; +--:-:-:-:0 IADD writeS, writeS, swapBuf; +--:-:-:Y:5 BAR.SYNC 0; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; +--:-:-:Y:5 @P1 BRA.U FILTER_LOOP; + + +LOAD_FINISH: + +[- + our $trans1 = "0.244140625"; + our $trans2 = "0.625"; + our $trans3 = "0.390625"; +-] + + diff --git a/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_X.sass b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_X.sass new file mode 100644 index 0000000..15a0f0b --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_X.sass @@ -0,0 +1,687 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $type; +our $dtype = $type eq 'h' ? 'U16' : '32'; +our $convert_in = $type eq 'h' ? 'F2F.F32.F16' : ''; +our $convert_out = $type eq 'h' ? 
'F2F.F16.F32' : ''; +our $dshift = $type eq 'h' ? '1' : '2'; +our $dsize = $type eq 'h' ? '2' : '4'; +our $vsize = $type eq 'h' ? '64' : '128'; +sub dtype { return $dtype; } +sub dsize { return $dsize; } +sub dshift { return $dshift; } +sub vsize { return $vsize; } +-] + + + + addr_zero : 4x<32*36*2*4 + 64 + 0> + addr_idx_Y : 4x<32*36*2*4 + 64 + 4> + addr_idx_X : 4x<32*36*2*4 + 64 + 5> + addr_idx_K : 4x<32*36*2*4 + 64 + 6> + + param_S[0] : c[0x0][0x140] + param_S[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_C : c[0x0][0x174] + param_K : c[0x0][0x178] + param_N : c[0x0][0x17c] + param_Xk : c[0x0][0x180] + param_k : c[0x0][0x184] + param_magic_Xk : c[0x0][0x188] + param_shift_Xk : c[0x0][0x18c] + param_magic_k : c[0x0][0x190] + param_shift_k : c[0x0][0x194] + param_C_1152 : c[0x0][0x198] + param_GXS_C_1152 : c[0x0][0x19c] + param_GYS_GXS_C_1152 : c[0x0][0x1a0] + param_P : c[0x0][0x1a4] + param_Q : c[0x0][0x1a8] + param_QN : c[0x0][0x1ac] + param_PQN : c[0x0][0x1b0] + param_PQN15 : c[0x0][0x1b4] + param_maskN : c[0x0][0x1b8] + param_shiftX : c[0x0][0x1bc] + param_shiftY : c[0x0][0x1c0] + param_superX : c[0x0][0x1c4] + param_superY : c[0x0][0x1c8] + param_gridN : c[0x0][0x1cc] + param_gridQN : c[0x0][0x1d0] + param_gridPQN : c[0x0][0x1d4] + + + + + + 0-63 : czero<00-63> + + 3, 2,11,10 : clx<0-3>y0 + 7, 6,15,14 : clx<0-3>y1 + 1, 0, 9, 8 : clx<0-3>y2 + 5, 4,13,12 : clx<0-3>y3 + 19,18,27,26 : clx<0-3>y4 + 23,22,31,30 : clx<0-3>y5 + 17,16,25,24 : clx<0-3>y6 + 21,20,29,28 : clx<0-3>y7 + + 32-43 : jl0Ix<0-3>, jl0Fy<0-7> + 44-51 : jl1Ix<0-3>, jl1Fy<4-7> + 36-39 : jl1Fy<0-3> + + 52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3> + 88-89 : 
track<0-1> + 90-91 ~ writeS + + 32-39 ~ partialC, idx_K, idx_Y, idx_X + 40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, c, offset, idx_N + + 32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1 + 48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16 + + + 3, 2,11,10,19,18,27,26 : ccx<0-7>y0 + 7, 6,15,14,23,22,31,30 : ccx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2 + 5, 4,13,12,21,20,29,28 : ccx<0-7>y3 + 35,34,43,42,51,50,59,58 : ccx<0-7>y4 + 39,38,47,46,55,54,63,62 : ccx<0-7>y5 + 33,32,41,40,49,48,57,56 : ccx<0-7>y6 + 37,36,45,44,53,52,61,60 : ccx<0-7>y7 + + 64-79 : jc0Ix<0-7>, jc0Fy<0-7> + 80-91 : jc1Ix<4-7>, jc1Fy<0-7> + 64-67 : jc1Ix<0-3> + + 64-86 ~ tid16, tid_1, tid128 + + 87 = tid + 92-95 ~ C, swapBuf, readFs, readIs + + 64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q + 86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset + + 64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1 + + // t00 80 r00 78 + // t10 m10 r01 w01 + // t20 m20 r02 w02 + // t30 m30 r03 w03 + // w00 m00 s00 w00 + // w30 m40 s01 w01 + // w10 m10 s02 w02 + // w20 m20 s03 w04 + + 78 = t0<0-5>, r<0-3>0 + 79 = temp + + 3, 2,11,10,19,18 : m<0-5>0 + 1, 9, 0, 8,17,16 : m<0-5>1 + 27,26,25,24,64,65 : m<0-5>2 + 2,11,10 : t10, t20, t30 + 9, 0, 8 : t11, t21, t31 + 26,25,24 : t12, t22, t32 + 3, 2,11,19 : w00, w10, w20, w30 + 1, 9, 0,17 : w01, w11, w21, w31 + 27,26,25,64 : w02, w12, w22, w32 + + 66,67,68,69,70,71 : m<0-5>3 + 72,73,74,75,76,77 : m<0-5>4 + 8,24,10,65,16,18 : m<0-5>5 + 67,68,69 : t13, t23, t33 + 73,74,75 : t14, t24, t34 + 24,10,65 : t15, t25, t35 + 66,67,68,70 : w03, w13, w23, w33 + 72,73,74,76 : w04, w14, w24, w34 + 8,24,10,16 : w05, w15, w25, w35 + + 1,27,66 : r01, r02, r03 + 9,26,67 : r11, r12, r13 + 0,25,68 : r21, r22, r23 + 17,64,70 : r31, r32, r33 + 3, 1,27,72 : s00, s01, s02, s03 + 2, 9,26,73 : s10, s11, s12, s13 + 11, 0,25,74 : s20, s21, s22, s23 + 
19,17,64,76 : s30, s31, s32, s33 + + 80-83 ~ xx<0-3> + 78-81 ~ sum<0-3> + 82-83 : Sum<0-1> + 84-85 : Out<0-1> + + 8,10,16,18 ~ b0<0-3> + 24,65,66,67 ~ b1<0-3> + 68,69,70,71 ~ b2<0-3> + 75,77,78,79 ~ b3<0-3> + + + +--:-:-:-:0 MOV C, param_C; +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:-:-:1 MOV swapBuf, 4x<32*36*2*2>; +01:-:-:-:0 ISETP.GE.AND P0, PT, tid, 128, PT; +--:-:-:-:1 STS.128 [addr_zero], RZ; +--:-:-:Y:c LOP.AND partialC, C, 1; +--:-:-:-:0 IADD C, C, partialC; +--:-:-:-:5 @P0 BRA.U COMPUTE_SETUP; + +############################################################## +LOAD_SETUP: + +--:-:1:-:1 S2R idx_YXk, SR_CTAID.X; +--:-:2:-:1 S2R idx_K, SR_CTAID.Y; + + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +] + +--:-:-:-:1 ISETP.EQ.AND P0, PT, tid, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P1, PT, tid, 64, PT; + +// idx_Y = idx_YXk / blk_Xk +--:-:-:-:1 MOV magic_Xk, param_magic_Xk; +--:-:-:-:1 IADD negXk, RZ, -param_Xk; +--:-:-:-:1 ISETP.NE.AND P3, PT, magic_Xk, 1, PT; +01:-:-:-:1 @P3 XMAD div1, idx_YXk, magic_Xk, RZ; +--:-:-:-:1 @P3 XMAD div2, idx_YXk, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD div3, idx_YXk.H1, magic_Xk.H1, RZ; +--:-:-:-:1 @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk, div1; +--:-:-:-:1 @P3 IADD3.RS idx_Y, div1, div2, div3; +--:-:-:-:1 @P3 SHR.U32 idx_Y, idx_Y, param_shift_Xk; +--:-:-:-:1 @!P3 SHR.U32 idx_Y, idx_YXk, param_shift_Xk; + +// idx_Xk = idx_YXk % blk_Xk +--:-:-:-:1 XMAD.LO2 idx_Xk, negXk, idx_Y, idx_YXk; + +// idx_X = idx_Xk / blk_k +// idx_k = idx_Xk % blk_k +--:-:-:-:1 XMAD idx_X, idx_Xk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_X, idx_X, param_shift_k; +--:-:-:-:1 XMAD idx_k, idx_X, param_k, RZ; +--:-:-:-:1 IADD idx_k, -idx_k, idx_Xk; + +// idx_K = idx_K * blk_k + idx_k +02:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +--:-:-:-:1 @P0 STS [addr_idx_Y], idx_Y; +--:-:-:-:1 @P0 STS [addr_idx_X], idx_X; +--:-:-:-:1 @P0 STS [addr_idx_K], idx_K; + + +--:-:-:-:1 LOP.AND tid32_2, tid, -32; +--:-:-:-:1 SHR.U32 tid32_2, 
tid32_2, 2; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) +--:-:-:-:1 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid32_2; +--:-:-:-:1 SHL readIs, readIs, 4; + +// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readFs, tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD3 readFs, readFs, tid1, tid32_2; +--:-:-:-:1 ISCADD readFs, readFs, 4x<32*36*2>, 4; + +// c = (tid & 63) >> 5 +--:-:-:-:1 BFE.U32 c, tid, 0x105; // 1 bit at position 5 + +// partialC = (2 - partialC) +// P6 = c < partialC +// partialC *= 32*36 * itemsize +--:-:-:-:1 IADD partialC, -partialC, 2; +--:-:-:-:1 ISETP.LT.AND P6, PT, c, partialC, PT; +--:-:-:-:1 XMAD partialC, partialC, 1x<32*36 * $dsize>, RZ; + +// writeS = (c*32*36 + (tid & 31)*4)*4 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL writeS, tid31, 4; +--:-:-:-:1 XMAD writeS, c, 4x<32*36>, writeS; + +// offset = c*32*36 + tid31*4 +--:-:-:-:1 SHL tid31, tid31, 2; +--:-:-:-:1 XMAD offset, c, 1x<32*36>, tid31; + + +// P5 = C > 2 +--:-:-:-:1 ISETP.GT.AND P5, PT, C, 2, PT; + + +--:-:-:-:5 @P1 BRA.U FILTER_SETUP; + +############################################################## +IMAGE_SETUP: + +--:-:1:-:1 S2R idx_N, SR_CTAID.Z; + +// (GN,GYS,GXS,C,6,6,32) +// offset += (idx_N*GYS*GXS*C*32*36 + idx_Y*GXS*C*32*36 + idx_X*C*32*36) * itemsize; +--:-:-:-:1 XMAD.LO2C offset, idx_X, param_C_1152, offset; +--:-:-:-:1 XMAD.LO2C offset, idx_Y, param_GXS_C_1152, offset; +01:-:-:-:1 XMAD.LO2C offset, idx_N, param_GYS_GXS_C_1152, offset; +--:-:-:-:1 LEA track0.CC, offset, param_I[0], [+ dshift() +]; +--:-:-:-:0 LEA.HI.X track1, offset, param_I[1], RZ, [+ dshift() +]; + + +--:-:-:-:5 BRA.U LOAD; + +############################################################## +FILTER_SETUP: + + +// writeS += 32*36*2*4 +--:-:-:-:1 IADD writeS, writeS, 4x<32*36*2>; + +// (kBlks,C,6,6,32) +// offset += (idx_K*C*32*36) * itemsize; +--:-:-:-:1 
XMAD.LO2C offset, idx_K, param_C_1152, offset; +--:-:-:-:1 LEA track0.CC, offset, param_F[0], [+ dshift() +]; +--:-:-:-:2 LEA.HI.X track1, offset, param_F[1], RZ, [+ dshift() +]; + + +############################################################## +LOAD: + +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>]; +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>]; +--:-:2:-:1 @P6 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T0, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T1, [addr_zero]; +--:-:2:-:1 @!P6 LDS.U.[+ vsize() +] T2, [addr_zero]; + +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>]; +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>]; +--:-:3:-:1 @P6 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T3, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T4, [addr_zero]; +--:-:3:-:1 @!P6 LDS.U.[+ vsize() +] T5, [addr_zero]; + +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>]; +--:-:-:-:1 @P6 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>]; +--:-:4:-:1 @P6 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>]; + +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T6, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T7, [addr_zero]; +--:-:4:-:1 @!P6 LDS.U.[+ vsize() +] T8, [addr_zero]; + +[+ + our $convert_in; + return $convert_in ? 
q{ + +02:-:-:-:1 F2F.F32.F16 T03, T01.H1; +--:-:-:-:1 F2F.F32.F16 T02, T01.H0; +--:-:-:-:1 F2F.F32.F16 T01, T00.H1; +--:-:2:-:1 F2F.F32.F16 T00, T00.H0; + +--:-:-:-:1 F2F.F32.F16 T13, T11.H1; +--:-:-:-:1 F2F.F32.F16 T12, T11.H0; +--:-:-:-:1 F2F.F32.F16 T11, T10.H1; +--:-:5:-:1 F2F.F32.F16 T10, T10.H0; + +--:-:-:-:1 F2F.F32.F16 T23, T21.H1; +--:-:-:-:1 F2F.F32.F16 T22, T21.H0; +--:-:-:-:1 F2F.F32.F16 T21, T20.H1; +--:-:6:-:1 F2F.F32.F16 T20, T20.H0; + +02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0; + +04:-:-:-:1 F2F.F32.F16 T33, T31.H1; +--:-:-:-:1 F2F.F32.F16 T32, T31.H0; +--:-:-:-:1 F2F.F32.F16 T31, T30.H1; +--:-:3:-:1 F2F.F32.F16 T30, T30.H0; + +10:-:-:-:1 STS.128 [writeS + 4x<1*32*4>], T1; + +--:-:-:-:1 F2F.F32.F16 T43, T41.H1; +--:-:-:-:1 F2F.F32.F16 T42, T41.H0; +--:-:-:-:1 F2F.F32.F16 T41, T40.H1; +--:-:5:-:1 F2F.F32.F16 T40, T40.H0; + +20:-:-:-:1 STS.128 [writeS + 4x<2*32*4>], T2; + +--:-:-:-:1 F2F.F32.F16 T53, T51.H1; +--:-:-:-:1 F2F.F32.F16 T52, T51.H0; +--:-:-:-:1 F2F.F32.F16 T51, T50.H1; +--:-:6:-:1 F2F.F32.F16 T50, T50.H0; + +04:-:-:-:1 STS.128 [writeS + 4x<3*32*4>], T3; + +08:-:-:-:1 F2F.F32.F16 T63, T61.H1; +--:-:-:-:1 F2F.F32.F16 T62, T61.H0; +--:-:-:-:1 F2F.F32.F16 T61, T60.H1; +--:-:4:-:1 F2F.F32.F16 T60, T60.H0; + +10:-:-:-:1 STS.128 [writeS + 4x<4*32*4>], T4; + +--:-:-:-:1 F2F.F32.F16 T73, T71.H1; +--:-:-:-:1 F2F.F32.F16 T72, T71.H0; +--:-:-:-:1 F2F.F32.F16 T71, T70.H1; +--:-:5:-:1 F2F.F32.F16 T70, T70.H0; + +20:-:-:-:1 STS.128 [writeS + 4x<5*32*4>], T5; + +--:-:-:-:1 F2F.F32.F16 T83, T81.H1; +--:-:-:-:1 F2F.F32.F16 T82, T81.H0; +--:-:-:-:1 F2F.F32.F16 T81, T80.H1; +--:-:6:-:1 F2F.F32.F16 T80, T80.H0; + +08:-:-:-:1 STS.128 [writeS + 4x<6*32*4>], T6; +10:-:-:-:1 STS.128 [writeS + 4x<7*32*4>], T7; +20:-:-:-:1 STS.128 [writeS + 4x<8*32*4>], T8; + + } : q{ +02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0; +--:-:-:-:1 STS.128 [writeS + 4x<1*32*4>], T1; +--:-:-:-:1 STS.128 [writeS + 4x<2*32*4>], T2; +04:-:-:-:1 STS.128 [writeS + 4x<3*32*4>], T3; 
+--:-:-:-:1 STS.128 [writeS + 4x<4*32*4>], T4; +--:-:-:-:1 STS.128 [writeS + 4x<5*32*4>], T5; +08:-:-:-:1 STS.128 [writeS + 4x<6*32*4>], T6; +--:-:-:-:1 STS.128 [writeS + 4x<7*32*4>], T7; +--:-:-:-:1 STS.128 [writeS + 4x<8*32*4>], T8; + }; ++] + +--:-:-:-:0 IADD track0.CC, track0, partialC; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeS, writeS, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X track1, track1, RZ; + +--:-:-:-:1 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:1:-:1 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>]; + +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>]; +--:-:2:-:1 @P5 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>]; +--:-:3:-:1 @P5 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>]; +--:-:-:-:1 @P5 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>]; +--:6:4:-:1 @P5 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>]; + +--:-:-:-:5 BRA.U LOAD_LOOP; + +############################################################## + +COMPUTE_SETUP: + + +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +--:-:-:-:1 IADD tid128, tid, -128; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) +// readFs = ((tid & -16) >> 1) | ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND tid16, tid128, -16; +--:-:-:-:1 SHR.U32 tid16, tid16, 1; + +--:-:-:-:1 BFE.U32 readIs, tid128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, tid16; +--:-:-:-:1 ISCADD readIs, readIs, 4x<32*4>, 4; + +--:-:-:-:1 LOP.AND tid_1, tid128, 1; +--:-:-:-:1 LOP.AND readFs, tid128, 8; +--:-:-:-:1 SHR.U32 readFs, readFs, 2; +--:-:-:-:1 IADD3 readFs, readFs, 
tid16, tid_1; +--:-:-:-:0 ISCADD readFs, readFs, 4x<32*4 + 32*36*2>, 4; + + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>]; +--:-:-:-:1 LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>]; +--:-:1:-:2 LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>]; + +COMPUTE_LOOP: +[+ + my %insert = ( + + j0c33 => "--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT;\n" . + "--:-:-:-:1 IADD C, C, -2;\n", + + j0c62 => "02:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 IADD readFs, readFs, swapBuf;\n" . + "--:-:-:-:1 IADD readIs, readIs, swapBuf;\n" . + "--:-:-:-:1 IADD swapBuf, RZ, -swapBuf;\n", + + j1c63 => "--:-:-:Y:5 \@P0 BRA.U COMPUTE_LOOP;\n" . + "--:-:-:Y:5 BRA.U COMPUTE_FINISH;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 1) + { + my $odd = $j; + my $nOdd = 1 - $j; + my $rsPred = $j == 1 ? '@P0' : ' '; + my $bar = $j == 0 ? '2' : '-'; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd; + + $insert{"j${j}c31"} = sprintf "--:%s:1:-:1 %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd; + + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $yield = $c % 10 == 0 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + +LOAD_LOOP: +--:-:-:-:1 ISETP.GT.AND P0, PT, C, 2, PT; +20:-:-:-:1 IADD track0.CC, track0, 1x<32*36*2 * $dsize>; +--:-:-:-:1 ISETP.GT.AND P1, PT, C, 4, PT; +--:-:-:-:1 IADD C, C, -2; +[+ + our ($vsize, $dsize, $convert_in); + my %insert = ( + + j0c3 => "--:-:-:-:1 IADD.X track1, track1, RZ;\n", + + j0c0 => "--:-:-:-:1 LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n", + j0c2 => "--:-:-:-:1 LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n", + j0c18 => "--:-:1:-:1 LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n", + + j1c12 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n", + j1c14 => "--:-:-:-:1 \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n", + j1c16 => "--:-:1:-:1 \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n", + + $convert_in ? ( + + j0c1 => "02:-:-:-:1 F2F.F32.F16 T03, T01.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T02, T01.H0;\n", + j0c4 => "--:-:-:-:1 F2F.F32.F16 T01, T00.H1;\n" . + "--:-:2:-:1 F2F.F32.F16 T00, T00.H0;\n", + + j0c5 => "--:-:-:-:1 F2F.F32.F16 T13, T11.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T12, T11.H0;\n", + j0c6 => "--:-:-:-:1 F2F.F32.F16 T11, T10.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T10, T10.H0;\n", + + j0c7 => "--:-:-:-:1 F2F.F32.F16 T23, T21.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T22, T21.H0;\n", + j0c8 => "--:-:-:-:1 F2F.F32.F16 T21, T20.H1;\n" . 
+ "--:-:6:-:1 F2F.F32.F16 T20, T20.H0;\n", + + j0c9 => "02:2:-:-:1 \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n", + j0c10 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n", + j0c11 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n", + + j0c13 => "02:-:-:-:1 \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n", + j0c14 => "10:-:-:-:1 \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n", + j0c15 => "20:-:2:-:1 \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n", + + j0c16 => "04:-:-:-:1 F2F.F32.F16 T33, T31.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T32, T31.H0;\n", + j0c17 => "--:-:-:-:1 F2F.F32.F16 T31, T30.H1;\n" . + "--:-:3:-:1 F2F.F32.F16 T30, T30.H0;\n", + + j0c19 => "--:-:-:-:1 F2F.F32.F16 T43, T41.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T42, T41.H0;\n", + j0c20 => "--:-:-:-:1 F2F.F32.F16 T41, T40.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T40, T40.H0;\n", + + j0c21 => "--:-:-:-:1 F2F.F32.F16 T53, T51.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T52, T51.H0;\n", + j0c22 => "--:-:-:-:1 F2F.F32.F16 T51, T50.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 T50, T50.H0;\n", + + j0c23 => "04:3:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n", + j0c24 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n", + j0c25 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n", + + j0c27 => "04:-:-:-:1 \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n", + j0c28 => "10:-:-:-:1 \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n", + j0c29 => "20:-:3:-:1 \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n", + + j0c30 => "08:-:-:-:1 F2F.F32.F16 T63, T61.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T62, T61.H0;\n", + j0c31 => "--:-:-:-:1 F2F.F32.F16 T61, T60.H1;\n" . + "--:-:4:-:1 F2F.F32.F16 T60, T60.H0;\n", + + j1c0 => "--:-:-:-:1 F2F.F32.F16 T73, T71.H1;\n" . + "--:-:-:-:1 F2F.F32.F16 T72, T71.H0;\n", + j1c1 => "--:-:-:-:1 F2F.F32.F16 T71, T70.H1;\n" . + "--:-:5:-:1 F2F.F32.F16 T70, T70.H0;\n", + + j1c2 => "--:-:-:-:1 F2F.F32.F16 T83, T81.H1;\n" . 
+ "--:-:-:-:1 F2F.F32.F16 T82, T81.H0;\n", + j1c3 => "--:-:-:-:1 F2F.F32.F16 T81, T80.H1;\n" . + "--:-:6:-:1 F2F.F32.F16 T80, T80.H0;\n", + + j1c4 => "08:4:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n", + j1c5 => "10:5:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n", + j1c6 => "20:6:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n", + + j1c8 => "08:-:-:-:1 \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n", + j1c9 => "10:-:-:-:1 \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n", + j1c10 => "20:6:4:-:1 \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n", + + ) : ( + + j0c6 => "02:-:-:-:1 STS.128 [writeS + 4x<0*32*4>], T0;\n", + j0c8 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n", + j0c10 => "--:2:-:-:1 \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n", + + j0c12 => "02:-:-:-:1 \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n", + j0c14 => "--:-:-:-:1 \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n", + j0c16 => "--:-:2:-:1 \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n", + + j0c20 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n", + j0c22 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n", + j0c24 => "--:3:-:-:1 \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n", + + j0c26 => "04:-:-:-:1 \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n", + j0c28 => "--:-:-:-:1 \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n", + j0c30 => "--:-:3:-:1 \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n", + + j1c0 => "08:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n", + j1c2 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n", + j1c4 => "--:4:-:-:1 \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n", + + j1c6 => "08:-:-:-:1 \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n", + j1c8 => "--:-:-:-:1 \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n", + j1c10 => "--:6:4:-:1 \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n", + ), + + j1c11 => "--:-:-:Y:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readFs, readFs, -swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD readIs, readIs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeS, writeS, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j1c31 => "--:-:-:Y:5 \@P0 BRA.U LOAD_LOOP;\n", + ); + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]) + { + my ($x, $y) = @$xy; + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + my $out; + foreach my $j (0 .. 1) + { + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "01" : '--'; + + my $stall = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $ctrl = "$wait:-:-:-:$stall"; + + $out .= sprintf "%s FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl, $x,$y, $j,$x, $j,$y, $x,$y, $ins; + } + } + return $out; ++] + +[- + our $trans1 = "0.343"; + our $trans2 = "0.700"; + our $trans3 = "0.490"; +-] + + diff --git a/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_common.sass b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_common.sass new file mode 100644 index 0000000..f2a06e6 --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_common.sass @@ -0,0 +1,807 @@ + +# Copyright 2016 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--:-:1:-:2 S2R Tid, SR_TID.X; + +--:-:-:-:1 MOV alpha16, param_alpha; + +01:-:-:-:1 LOP.AND Tid32_2, Tid, -32; +--:-:-:-:1 SHR.U32 Tid32_2, Tid32_2, 2; + +// readFs = ((tid & 16) >> 3) | (tid & 1) +--:-:-:-:1 LOP.AND Tid1, Tid, 1; +01:-:-:-:1 LOP.AND readFs, Tid, 16; +--:-:-:-:1 SHR.U32 readFs, readFs, 3; +--:-:-:-:1 IADD readFs, readFs, Tid1; + +// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2) +--:-:-:-:1 BFE.U32 readIs, Tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readIs, readIs, Tid32_2; +--:-:-:-:1 ISCADD readIs, readFs, readIs, 2; + +--:-:-:-:1 SHL readIs, readIs, 4; +--:-:-:-:1 SHL readFs, readFs, 3; + +// writeCs = readFs * 32*36 + readIs; +--:-:-:-:1 XMAD write16Cs, readFs, 1x<32*36>, readIs; + + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y0, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y2, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y2, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y2, alpha16; +--:-:-:-:4 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y1, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y3, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y3, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y3, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y4, alpha16; +--:-:-:-:1 
FMUL shuffle16_x2y0, clx2y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y4, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y6, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y6, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y6, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 FMUL shuffle16_x0y0, clx0y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y0, clx1y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y0, clx2y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x3y0, clx3y5, alpha16; +--:-:-:-:1 FMUL shuffle16_x0y1, clx0y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x1y1, clx1y7, alpha16; +--:-:-:-:1 FMUL shuffle16_x2y1, clx2y7, alpha16; +--:-:-:-:0 FMUL shuffle16_x3y1, clx3y7, alpha16; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0; +--:-:-:-:d STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 EXIT; + +COMPUTE_FINISH: + +--:-:1:-:2 S2R tid_128, SR_TID.X; + + +--:-:-:-:1 MOV alpha, param_alpha; + +01:-:-:-:1 IADD tid_128, tid_128, -128; + +--:-:-:-:1 ISETP.GE.AND P4, PT, tid_128, 256, PT; + +// readFs = ((tid & 8) >> 2) | (tid & 1) +--:-:-:-:1 LOP.AND Tid_1, tid_128, 1; +--:-:-:-:1 LOP.AND readFs2, tid_128, 8; +--:-:-:-:1 SHR.U32 readFs2, readFs2, 2; +--:-:-:-:1 IADD readFs2, readFs2, Tid_1; + +// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2) +--:-:-:-:1 LOP.AND tid_16, tid_128, -16; +--:-:-:-:1 SHR.U32 tid_16, tid_16, 1; +--:-:-:-:1 BFE.U32 readIs2, tid_128, 0x201; // 2 bits at position 1 +--:-:-:-:1 LOP.OR readIs2, readIs2, tid_16; +--:-:-:-:1 ISCADD readIs2, readFs2, readIs2, 2; + +--:-:-:-:1 ISCADD readIs2, readIs2, 4x<32*4>, 4; +--:-:-:-:1 SHL readFs2, readFs2, 3; + +// writeCs = readFs * 32*36 + readIs; +--:-:-:-:0 XMAD writeCs, readFs2, 1x<32*36>, readIs2; + + +--:-:-:-:5 @P4 BRA.U SKIP0; 
+ +--:-:2:-:1 LDS idxX, [addr_idx_X]; +--:-:3:-:1 LDS idxY, [addr_idx_Y]; +--:-:1:-:1 S2R idxN, SR_CTAID.Z; +--:-:4:-:1 LDS idxK, [addr_idx_K]; + + +--:-:-:-:1 LOP.AND tid_31, tid_128, 31; +--:-:-:-:1 SHR.U32 tid_32, tid_128, 5; +--:-:-:-:1 SHR.U32 tid_64, tid_128, 6; + +[+ + our $bsum; return $bsum ? q{ +03:-:-:-:1 XMAD bsum_offset, idxX, param_gridN, idxN; +04:-:-:-:1 XMAD.LO2C bsum_offset, idxY, param_gridQN, bsum_offset; + } : ''; ++] + +--:-:-:-:1 MOV32I one, 1.0; + +// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16 +--:-:-:-:1 XMAD readCs, tid_32, 1x<32*36>, tid_31; +--:-:-:-:1 ISCADD readCs, tid_64, readCs, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// n = idxN*32 + tid & maskN +--:-:-:-:1 LOP.AND n, tid_31, param_maskN; +01:-:-:-:1 ISCADD n, idxN, n, 5; + +// Superblock offset +// idxX <<= shiftX +// idxY <<= shiftY +02:-:-:-:1 SHL idxX, idxX, param_shiftX; +04:-:-:-:1 SHL idxY, idxY, param_shiftY; + +// Get this thread's offset within the superblock +--:-:-:-:1 BFE.U32 q, tid_31, param_superX; +--:-:-:-:1 BFE.U32 p, tid_31, param_superY; +--:-:-:-:1 ISCADD q, q, idxX, 2; +--:-:-:-:1 ISCADD p, p, idxY, 2; + +// k = idxK*32 + tid_32<<1 +--:-:-:-:1 SHL tid_32, tid_32, 1; +08:-:-:-:1 ISCADD k, idxK, tid_32, 5; + +// Out = k*PQN + p*QN + q*N + n +--:-:-:-:1 XMAD offsetO, q, param_N, n; +--:-:-:-:1 XMAD.LO2C offsetO, p, param_QN, offsetO; +--:-:-:-:1 XMAD.LO2C offsetO, k, param_PQN, offsetO; + +--:-:-:-:1 IADD z1, q, 1; +--:-:-:-:1 IADD z2, q, 2; +--:-:-:-:1 IADD z3, q, 3; + +--:-:-:-:1 ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! 
no-op +--:-:-:-:1 ISETP.LT.AND P6, PT, n, param_N, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, q, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_Q, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_Q, P5; +--:-:-:-:1 ISETP.GE.AND P0, PT, q, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; +--:-:-:-:1 P2R mask_q, PR, RZ, 0x0f; + +--:-:-:-:1 IADD z1, p, 1; +--:-:-:-:1 IADD z2, p, 2; +--:-:-:-:1 IADD z3, p, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, p, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, z1, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, z2, param_P, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, z3, param_P, P6; +--:-:-:-:1 ISETP.GE.AND P0, PT, p, RZ, P0; +--:-:-:-:1 ISETP.GE.AND P1, PT, z1, RZ, P1; +--:-:-:-:1 ISETP.GE.AND P2, PT, z2, RZ, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, z3, RZ, P3; + +--:-:-:-:1 SEL preds, mask_q, RZ, P0; +--:-:-:-:1 @P1 BFI preds, mask_q, 0x404, preds; +--:-:-:-:1 @P2 BFI preds, mask_q, 0x408, preds; +--:-:-:-:1 @P3 BFI preds, mask_q, 0x40c, preds; + +--:-:-:-:1 ISETP.EQ.AND P6, PT, tid_31, RZ, PT; + + +SKIP0: + + +--:-:-:-:1 FMUL shuffle_x0y0, ccx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y0, alpha; +--:-:-:-:1 FMUL shuffle_x7y0, ccx7y0, alpha; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y2, alpha; +--:-:-:-:1 FMUL shuffle_x3y1, ccx3y2, alpha; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y2, alpha; +--:-:-:-:1 FMUL shuffle_x7y1, ccx7y2, alpha; + +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP1; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 1; +--:-:-:-:1 IADD offsetO, offsetO, param_PQN; + +SKIP1: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y1, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y1, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y1, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP2; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 15; +--:-:-:-:1 IADD offsetO, offsetO, param_PQN15; + +SKIP2: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y4, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y4, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; 
+--:-:-:-:1 FMUL shuffle_x0y1, ccx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y6, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP3; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +--:-:-:-:1 IADD k, k, 1; +--:-:-:-:1 IADD offsetO, offsetO, param_PQN; + +SKIP3: + +--:-:-:-:0 FMUL shuffle_x0y0, ccx0y5, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 FMUL shuffle_x1y0, ccx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, ccx2y5, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, ccx3y5, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, ccx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, ccx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, ccx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, ccx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, ccx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, ccx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, ccx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, ccx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, ccx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, ccx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, ccx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, ccx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1; +--:-:-:-:d STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 @P4 BRA.U SKIP4; +--:-:-:-:5 CAL OUTPUT_TRANSFORM; +SKIP4: + +--:-:-:-:5 EXIT; + +OUTPUT_TRANSFORM: + + +11:-:-:-:1 ISETP.LT.AND P5, PT, k, param_K, PT; +[+ + our $bias; + return $bias ? 
q{ +--:-:-:-:1 LEA Sum0.CC, k, param_S[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, k, param_S[1], RZ, 2; + +--:-:-:-:1 @!P5 MOV bias, RZ; +--:-:5:-:1 @P5 LDG.E.CI bias, [Sum]; + } : ''; ++] + + +[+ + my $out; + foreach my $i (0 .. 2) + { + foreach my $j (0 .. 5) + { + my $b = $i + 1; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + + +[+ + my $out; our ($trans1, $trans2, $trans3); + foreach my $i (0 .. 2) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ + +$w:-:-:-:1 FADD t0$i, m1$i, m2$i; +--:-:-:-:1 FADD t1$i, m1$i, -m2$i; +--:-:-:-:1 FADD t2$i, m3$i, -m4$i; +--:-:-:-:1 FADD t3$i, m3$i, m4$i; +--:-:-:-:1 FADD w0$i, t0$i, m0$i; +--:-:-:-:1 FMUL32I w3$i, t1$i, $trans1; +--:-:-:-:1 FMUL32I w1$i, t1$i, $trans2; +--:-:-:-:1 FMUL32I temp, t0$i, $trans3; +--:-:-:-:1 FFMA w3$i, t2$i, 3.375, w3$i; +--:-:-:-:1 FFMA w1$i, t2$i, 1.500, w1$i; +--:-:-:-:1 FFMA w2$i, t3$i, 2.250, temp; +--:-:-:-:1 FADD w0$i, w0$i, t3$i; +--:-:-:-:1 FADD w3$i, w3$i, m5$i; + + }; + } + foreach my $i (3 .. 5) + { + foreach my $j (0 .. 5) + { + my $b = $i + 1; + $out .= "--:-:$b:-:1 LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; + } + } + return $out; ++] + + + +[+ + my $out; our ($trans1, $trans2, $trans3); + + foreach my $i (3 .. 
5) + { + my $w = sprintf "%02x", 1 << $i; + $out .= qq{ + +$w:-:-:-:1 FADD t0$i, m1$i, m2$i; +--:-:-:-:1 FADD t1$i, m1$i, -m2$i; +--:-:-:-:1 FADD t2$i, m3$i, -m4$i; +--:-:-:-:1 FADD t3$i, m3$i, m4$i; +--:-:-:-:1 FADD w0$i, t0$i, m0$i; +--:-:-:-:1 FMUL32I w3$i, t1$i, $trans1; +--:-:-:-:1 FMUL32I w1$i, t1$i, $trans2; +--:-:-:-:1 FMUL32I temp, t0$i, $trans3; +--:-:-:-:1 FFMA w3$i, t2$i, 3.375, w3$i; +--:-:-:-:1 FFMA w1$i, t2$i, 1.500, w1$i; +--:-:-:-:1 FFMA w2$i, t3$i, 2.250, temp; +--:-:-:-:1 FADD w0$i, w0$i, t3$i; +--:-:-:-:1 FADD w3$i, w3$i, m5$i; + + }; + } + return $out; ++] +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + + + +[+ + my $out; + our ($convert_out, $bias, $relu, $trans1, $trans2, $trans3); + foreach my $i (0 .. 3) + { + $out .= qq{ +--:-:-:-:1 FADD r${i}0, w${i}1, w${i}2; +--:-:-:-:1 FADD r${i}1, w${i}1, -w${i}2; +--:-:-:-:1 FADD r${i}2, w${i}3, -w${i}4; +--:-:-:-:1 FADD r${i}3, w${i}3, w${i}4; +--:-:-:-:1 FADD s${i}0, r${i}0, w${i}0; +--:-:-:-:1 FMUL32I s${i}3, r${i}1, $trans1; +--:-:-:-:1 FMUL32I s${i}1, r${i}1, $trans2; +--:-:-:-:1 FMUL32I temp, r${i}0, $trans3; +--:-:-:-:1 FFMA s${i}3, r${i}2, 3.375, s${i}3; +--:-:-:-:1 FFMA s${i}1, r${i}2, 1.500, s${i}1; +--:-:-:-:1 FFMA s${i}2, r${i}3, 2.250, temp; +--:-:-:-:1 FADD s${i}0, s${i}0, r${i}3; +--:-:-:-:1 FADD s${i}3, s${i}3, w${i}5; + }; + if ($bias) + { + $out .= qq{ +10:-:-:-:1 FADD s${i}0, s${i}0, bias; +--:-:-:-:1 FADD s${i}1, s${i}1, bias; +--:-:-:-:1 FADD s${i}2, s${i}2, bias; +--:-:-:-:1 FADD s${i}3, s${i}3, bias;}; + } + if ($relu) + { + $out .= qq{ +--:-:-:-:1 FMNMX s${i}0, s${i}0, RZ, !PT; +--:-:-:-:1 FMNMX s${i}1, s${i}1, RZ, !PT; +--:-:-:-:1 FMNMX s${i}2, s${i}2, RZ, !PT; +--:-:-:-:1 FMNMX s${i}3, s${i}3, RZ, !PT;}; + } + } + return $out; ++] + + +[+ + our $prelu; my $out; + if ($prelu) + { + foreach my $i (0 .. 
3) + { + $out .= qq{ +// maximum(x, 0) + beta * minimum(0, x) +--:-:-:-:1 FMNMX b00, s${i}0, RZ, !PT; +--:-:-:-:1 FMNMX b01, s${i}1, RZ, !PT; +--:-:-:-:1 FMNMX b02, s${i}2, RZ, !PT; +--:-:-:-:1 FMNMX b03, s${i}3, RZ, !PT; + +--:-:-:-:1 FMNMX b10, s${i}0, RZ, PT; +--:-:-:-:1 FMNMX b11, s${i}1, RZ, PT; +--:-:-:-:1 FMNMX b12, s${i}2, RZ, PT; +--:-:-:-:1 FMNMX b13, s${i}3, RZ, PT; + +--:-:-:-:1 FFMA s${i}0, b10, param_beta, b00; +--:-:-:-:1 FFMA s${i}1, b11, param_beta, b01; +--:-:-:-:1 FFMA s${i}2, b12, param_beta, b02; +--:-:-:-:1 FFMA s${i}3, b13, param_beta, b03; + }; + } + } + return $out; ++] + +[+ + our ($beta, $brelu, $bprelu, $dtype, $dsize, $dshift, $convert_out, $Q, $N); + my $out; + if ($beta || $brelu || $bprelu) + { + my $preds = $beta ? q{ +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + } : ''; + + $out .= qq{ + +--:-:-:-:1 LEA Out0.CC, offsetO, param_X[0], $dshift; +--:-:-:-:1 LEA.HI.X Out1, offsetO, param_X[1], RZ, $dshift; + +--:-:-:-:1 \@P0 LDG.E.CG.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>]; +--:-:-:-:1 \@P1 LDG.E.CG.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CG.$dtype b02, [Out + ${dsize}x<0*$Q*$N + 2*$N>]; +--:-:1:-:1 \@P3 LDG.E.CG.$dtype b03, [Out + ${dsize}x<0*$Q*$N + 3*$N>]; +--:-:-:-:1 \@!P0 MOV b00, RZ; +--:-:-:-:1 \@!P1 MOV b01, RZ; +--:-:-:-:1 \@!P2 MOV b02, RZ; +--:-:-:-:1 \@!P3 MOV b03, RZ; +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + +--:-:-:-:1 \@P0 LDG.E.CG.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>]; +--:-:-:-:1 \@P1 LDG.E.CG.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CG.$dtype b12, [Out + ${dsize}x<1*$Q*$N + 2*$N>]; +--:-:2:-:1 \@P3 LDG.E.CG.$dtype b13, [Out + ${dsize}x<1*$Q*$N + 3*$N>]; +--:-:-:-:1 \@!P0 MOV b10, RZ; +--:-:-:-:1 \@!P1 MOV b11, RZ; +--:-:-:-:1 \@!P2 MOV b12, RZ; +--:-:-:-:1 \@!P3 MOV b13, RZ; +--:-:-:-:1 \@P5 R2P 
PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + +--:-:-:-:1 \@P0 LDG.E.CG.$dtype b20, [Out + ${dsize}x<2*$Q*$N + 0*$N>]; +--:-:-:-:1 \@P1 LDG.E.CG.$dtype b21, [Out + ${dsize}x<2*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CG.$dtype b22, [Out + ${dsize}x<2*$Q*$N + 2*$N>]; +--:-:3:-:1 \@P3 LDG.E.CG.$dtype b23, [Out + ${dsize}x<2*$Q*$N + 3*$N>]; +--:-:-:-:1 \@!P0 MOV b20, RZ; +--:-:-:-:1 \@!P1 MOV b21, RZ; +--:-:-:-:1 \@!P2 MOV b22, RZ; +--:-:-:-:1 \@!P3 MOV b23, RZ; +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.L.U64 preds, preds, 12, preds; + +--:-:-:-:1 \@P0 LDG.E.CG.$dtype b30, [Out + ${dsize}x<3*$Q*$N + 0*$N>]; +--:-:-:-:1 \@P1 LDG.E.CG.$dtype b31, [Out + ${dsize}x<3*$Q*$N + 1*$N>]; +--:-:-:-:1 \@P2 LDG.E.CG.$dtype b32, [Out + ${dsize}x<3*$Q*$N + 2*$N>]; +--:-:4:-:1 \@P3 LDG.E.CG.$dtype b33, [Out + ${dsize}x<3*$Q*$N + 3*$N>]; +--:-:-:-:1 \@!P0 MOV b30, RZ; +--:-:-:-:1 \@!P1 MOV b31, RZ; +--:-:-:-:1 \@!P2 MOV b32, RZ; +--:-:-:-:1 \@!P3 MOV b33, RZ;$preds +}; + + if ($convert_out) + { + $out .= q{ +01:-:-:-:1 F2F.F32.F16 b00, b00; +--:-:-:-:1 F2F.F32.F16 b01, b01; +--:-:-:-:1 F2F.F32.F16 b02, b02; +--:-:1:-:1 F2F.F32.F16 b03, b03; +02:-:-:-:1 F2F.F32.F16 b10, b10; +--:-:-:-:1 F2F.F32.F16 b11, b11; +--:-:-:-:1 F2F.F32.F16 b12, b12; +--:-:2:-:1 F2F.F32.F16 b13, b13; +04:-:-:-:1 F2F.F32.F16 b20, b20; +--:-:-:-:1 F2F.F32.F16 b21, b21; +--:-:-:-:1 F2F.F32.F16 b22, b22; +--:-:3:-:1 F2F.F32.F16 b23, b23; +08:-:-:-:1 F2F.F32.F16 b30, b30; +--:-:-:-:1 F2F.F32.F16 b31, b31; +--:-:-:-:1 F2F.F32.F16 b32, b32; +--:-:4:-:1 F2F.F32.F16 b33, b33;}; + } + } + return $out; ++] + + +[+ + our $beta; return $beta ? 
q{ +01:-:-:-:1 FFMA s00, b00, param_beta, s00; +--:-:-:-:1 FFMA s01, b01, param_beta, s01; +--:-:-:-:1 FFMA s02, b02, param_beta, s02; +--:-:-:-:1 FFMA s03, b03, param_beta, s03; +02:-:-:-:1 FFMA s10, b10, param_beta, s10; +--:-:-:-:1 FFMA s11, b11, param_beta, s11; +--:-:-:-:1 FFMA s12, b12, param_beta, s12; +--:-:-:-:1 FFMA s13, b13, param_beta, s13; +04:-:-:-:1 FFMA s20, b20, param_beta, s20; +--:-:-:-:1 FFMA s21, b21, param_beta, s21; +--:-:-:-:1 FFMA s22, b22, param_beta, s22; +--:-:-:-:1 FFMA s23, b23, param_beta, s23; +08:-:-:-:1 FFMA s30, b30, param_beta, s30; +--:-:-:-:1 FFMA s31, b31, param_beta, s31; +--:-:-:-:1 FFMA s32, b32, param_beta, s32; +--:-:-:-:1 FFMA s33, b33, param_beta, s33;} : ''; ++] +[+ + our ($brelu, $bprelu); my $out; + if ($brelu || $bprelu) + { + foreach my $i (0 .. 3) + { + my $w = sprintf "%02x", 1 << $i; + $out .= $brelu ? qq{ +//delta *= (x > 0) +$w:-:-:-:1 FSETP.GT.AND P0, PT, b${i}0, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b${i}1, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b${i}2, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b${i}3, RZ, PT; +--:-:-:-:1 \@!P0 MOV s${i}0, RZ; +--:-:-:-:1 \@!P1 MOV s${i}1, RZ; +--:-:-:-:1 \@!P2 MOV s${i}2, RZ; +--:-:-:-:1 \@!P3 MOV s${i}3, RZ; + } : qq{ +//delta *= ((x > 0) + slope * (x < 0)) +$w:-:-:-:1 FSETP.GT.AND P0, PT, b${i}0, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b${i}1, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b${i}2, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b${i}3, RZ, PT; +--:-:-:-:1 SEL xx0, one, RZ, P0; +--:-:-:-:1 SEL xx1, one, RZ, P1; +--:-:-:-:1 SEL xx2, one, RZ, P2; +--:-:-:-:1 SEL xx3, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b${i}0, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b${i}1, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b${i}2, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b${i}3, RZ, PT; +--:-:-:-:1 SEL b${i}0, one, RZ, P0; +--:-:-:-:1 SEL b${i}1, one, RZ, P1; +--:-:-:-:1 SEL b${i}2, one, RZ, P2; +--:-:-:-:1 SEL b${i}3, one, RZ, P3; +--:-:-:-:1 FFMA b${i}0, b${i}0, param_beta, xx0; 
+--:-:-:-:1 FFMA b${i}1, b${i}1, param_beta, xx1; +--:-:-:-:1 FFMA b${i}2, b${i}2, param_beta, xx2; +--:-:-:-:1 FFMA b${i}3, b${i}3, param_beta, xx3; +--:-:-:-:1 FMUL s${i}0, s${i}0, b${i}0; +--:-:-:-:1 FMUL s${i}1, s${i}1, b${i}1; +--:-:-:-:1 FMUL s${i}2, s${i}2, b${i}2; +--:-:-:-:1 FMUL s${i}3, s${i}3, b${i}3; + }; + } + $out .= q{ +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:5 @!P5 R2P PR, RZ, 0x0f; +--:-:-:-:5 SHF.R.U64 preds, preds, 4, preds; +}; + } + return $out; ++] + +[+ + our $bsum; my $out; + if ($bsum) + { + $out = q{ + +--:-:-:-:1 XMAD.LO2C bias, k, param_gridPQN, bsum_offset; +--:-:-:-:1 LEA Sum0.CC, bias, param_S[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, bias, param_S[1], RZ, 2; +--:-:-:-:1 MOV sum0, RZ; +--:-:-:-:1 MOV sum1, RZ; +--:-:-:-:1 MOV sum2, RZ; +--:-:-:-:1 MOV sum3, RZ;}; + foreach my $i (0 .. 3) + { + my ($dir, $amt) = $i == 2 ? ('L','12') : ('R','4'); + $out .= qq{ +--:-:-:-:1 \@P0 FADD sum0, sum0, s${i}0; +--:-:-:-:1 \@P1 FADD sum1, sum1, s${i}1; +--:-:-:-:1 \@P2 FADD sum2, sum2, s${i}2; +--:-:-:-:1 \@P3 FADD sum3, sum3, s${i}3; +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.$dir.U64 preds, preds, $amt, preds;}; + } + $out .= q{ +--:-:-:-:1 FADD sum0, sum0, sum1; +--:-:-:-:1 FADD sum2, sum2, sum3; +--:-:-:-:1 FADD sum0, sum0, sum2; +}; + } + return $out; ++] +[+ + our $convert_out; return $convert_out ? 
q{ +--:-:-:-:1 F2F.F16.F32 s00, s00; +--:-:-:-:1 F2F.F16.F32 s01, s01; +--:-:-:-:1 F2F.F16.F32 s02, s02; +--:-:1:-:1 F2F.F16.F32 s03, s03; +--:-:-:-:1 F2F.F16.F32 s10, s10; +--:-:-:-:1 F2F.F16.F32 s11, s11; +--:-:-:-:1 F2F.F16.F32 s12, s12; +--:-:2:-:1 F2F.F16.F32 s13, s13; +--:-:-:-:1 F2F.F16.F32 s20, s20; +--:-:-:-:1 F2F.F16.F32 s21, s21; +--:-:-:-:1 F2F.F16.F32 s22, s22; +--:-:3:-:1 F2F.F16.F32 s23, s23; +--:-:-:-:1 F2F.F16.F32 s30, s30; +--:-:-:-:1 F2F.F16.F32 s31, s31; +--:-:-:-:1 F2F.F16.F32 s32, s32; +--:-:4:-:1 F2F.F16.F32 s33, s33;} : ''; ++] + +[+ + our ($bsum, $dtype, $dsize, $dshift, $Q, $N); + return $bsum ? qq{ +--:-:-:Y:6 LEA Out0.CC, offsetO, param_O[0], $dshift; +--:-:-:-:0 LEA.HI.X Out1, offsetO, param_O[1], RZ, $dshift; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 1, 0x1f; +01:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03; +--:-:-:-:2 \@P5 R2P PR, preds, 0x0f; +--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f; + +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:-:-:0 SHF.R.U64 preds, preds, 4, preds; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 2, 0x1f; + +02:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13; +--:-:-:-:2 \@P5 R2P PR, preds, 0x0f; +--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f; + +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:-:-:0 SHF.R.U64 preds, preds, 4, preds; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 4, 0x1f; + +04:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 
2*$N>], s22; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23; +--:-:-:-:2 \@P5 R2P PR, preds, 0x0f; +--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f; + +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:-:-:0 SHF.L.U64 preds, preds, 12, preds; +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 8, 0x1f; + +08:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32; +--:1:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33; + +10:-:-:-:4 FADD sum0, sum1, sum0; +--:-:-:-:0 PSETP.AND.AND P5, PT, P5, P6, PT; // k < K && tid31 == 0 +--:-:5:-:2 SHFL.BFLY PT, sum1, sum0, 16, 0x1f; +10:-:-:-:2 FADD sum0, sum1, sum0; +--:5:-:-:1 \@P5 STG.E.CG [Sum], sum0; + } : qq{ + + +--:-:-:-:1 LEA Out0.CC, offsetO, param_O[0], $dshift; +--:-:-:-:1 LEA.HI.X Out1, offsetO, param_O[1], RZ, $dshift; + +01:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + +02:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12; +--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.R.U64 preds, preds, 4, preds; + +04:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 2*$N>], s22; 
+--:-:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23; + +--:-:-:-:1 \@P5 R2P PR, preds, 0x0f; +--:-:-:-:1 \@!P5 R2P PR, RZ, 0x0f; +--:-:-:-:1 SHF.L.U64 preds, preds, 12, preds; + +08:-:-:-:1 \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30; +--:-:-:-:1 \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31; +--:-:-:-:1 \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32; +--:1:-:-:1 \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33; + + + }; ++] + +--:-:-:-:5 RET; diff --git a/Kernel/Convolution/Pascal/xconv_xprop_common.sass b/Kernel/Convolution/Pascal/xconv_xprop_common.sass new file mode 100644 index 0000000..110dc4d --- /dev/null +++ b/Kernel/Convolution/Pascal/xconv_xprop_common.sass @@ -0,0 +1,841 @@ + + +[- + # Kernel Options: + our ($beta, $bias, $relu, $prelu, $brelu, $bprelu, $bsum); + + # set externally + our ($prefix, $prop, $shareI, $shareF, $stepI, $stepF, $remapI, $remapF); + + our $addr_shift = $prefix eq 's' ? 2 : 1; + our $half = $prefix eq 'h'; + + sub params + { + return <<'EOF'; + param_Sum[0] : c[0x0][0x140] + param_Sum[1] : c[0x0][0x144] + param_X[0] : c[0x0][0x148] + param_X[1] : c[0x0][0x14c] + param_O[0] : c[0x0][0x150] + param_O[1] : c[0x0][0x154] + param_I[0] : c[0x0][0x158] + param_I[1] : c[0x0][0x15c] + param_F[0] : c[0x0][0x160] + param_F[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_flags : c[0x0][0x170] + param_N : c[0x0][0x174] + param_K : c[0x0][0x178] + param_D : c[0x0][0x17c] + param_H : c[0x0][0x180] + param_W : c[0x0][0x184] + param_WN : c[0x0][0x188] + param_HWN : c[0x0][0x18c] + param_DHWN : c[0x0][0x190] + param_C : c[0x0][0x194] + param_KRST : c[0x0][0x198] + param_RST : c[0x0][0x19c] + param_RS : c[0x0][0x1a0] + param_T : c[0x0][0x1a4] + param_R : c[0x0][0x1a8] + param_S : c[0x0][0x1ac] + param_magic_RS : c[0x0][0x1b0] + param_shift_RS : c[0x0][0x1b4] + param_magic_S : c[0x0][0x1b8] + param_shift_S : c[0x0][0x1bc] + param_pad_d : 
c[0x0][0x1c0] + param_pad_h : c[0x0][0x1c4] + param_pad_w : c[0x0][0x1c8] + param_str_d : c[0x0][0x1cc] + param_str_h : c[0x0][0x1d0] + param_str_w : c[0x0][0x1d4] + param_dil_d : c[0x0][0x1d8] + param_dil_h : c[0x0][0x1dc] + param_dil_w : c[0x0][0x1e0] + param_P2 : c[0x0][0x1e4] + param_Q : c[0x0][0x1e8] + param_PQk : c[0x0][0x1ec] + param_Qk : c[0x0][0x1f0] + param_k : c[0x0][0x1f4] + param_magic_PQk : c[0x0][0x1f8] + param_shift_PQk : c[0x0][0x1fc] + param_magic_Qk : c[0x0][0x200] + param_shift_Qk : c[0x0][0x204] + param_magic_k : c[0x0][0x208] + param_shift_k : c[0x0][0x20c] + param_QN : c[0x0][0x210] + param_PQN : c[0x0][0x214] + param_MPQN : c[0x0][0x218] + param_gridN : c[0x0][0x21c] + param_gridQN : c[0x0][0x220] + param_gridPQN : c[0x0][0x224] + param_gridMPQN : c[0x0][0x228] + param_magic_str_d : c[0x0][0x22c] + param_shift_str_d : c[0x0][0x230] + param_magic_str_h : c[0x0][0x234] + param_shift_str_h : c[0x0][0x238] + param_magic_str_w : c[0x0][0x23c] + param_shift_str_w : c[0x0][0x240] +EOF + } + + sub get_mpqk + { + return <<'EOF'; +// idx_M = idx_MPQk / blk_PQk +--:-:-:-:1 MOV magic_PQk, param_magic_PQk; +--:-:-:-:1 ISETP.NE.AND P1, PT, magic_PQk, 1, PT; +02:-:-:-:1 @P1 XMAD div1, idx_MPQk, magic_PQk, RZ; +--:-:-:-:1 @P1 XMAD div2, idx_MPQk, magic_PQk.H1, RZ; +--:-:-:-:1 @P1 XMAD div3, idx_MPQk.H1, magic_PQk.H1, RZ; +--:-:-:-:1 @P1 XMAD.CHI div1, idx_MPQk.H1, magic_PQk, div1; +--:-:-:-:1 @P1 IADD3.RS idx_M, div1, div2, div3; +--:-:-:-:1 @P1 SHR.U32 idx_M, idx_M, param_shift_PQk; +--:-:-:-:1 @!P1 SHR.U32 idx_M, idx_MPQk, param_shift_PQk; + +// idx_PQk = idx_PQk % blk_Qk +--:-:-:-:1 IADD neg_PQk, RZ, -param_PQk; +--:-:-:-:1 XMAD.LO2 idx_PQk, neg_PQk, idx_M, idx_MPQk; + +// idx_P2 = idx_PQk / blk_Qk +--:-:-:-:1 MOV magic_Qk, param_magic_Qk; +--:-:-:-:1 ISETP.NE.AND P2, PT, magic_Qk, 1, PT; +--:-:-:-:1 @P2 XMAD div1, idx_PQk, magic_Qk, RZ; +--:-:-:-:1 @P2 XMAD div2, idx_PQk, magic_Qk.H1, RZ; +--:-:-:-:1 @P2 XMAD div3, idx_PQk.H1, magic_Qk.H1, RZ; 
+--:-:-:-:1 @P2 XMAD.CHI div1, idx_PQk.H1, magic_Qk, div1; +--:-:-:-:1 @P2 IADD3.RS idx_P2, div1, div2, div3; +--:-:-:-:1 @P2 SHR.U32 idx_P2, idx_P2, param_shift_Qk; +--:-:-:-:1 @!P2 SHR.U32 idx_P2, idx_PQk, param_shift_Qk; + +// idx_Qk = idx_PQk % blk_Qk +--:-:-:-:1 IADD neg_Qk, RZ, -param_Qk; +--:-:-:-:1 XMAD.LO2 idx_Qk, neg_Qk, idx_P2, idx_PQk; + +// idx_Q2 = idx_Qk / k +--:-:-:-:1 XMAD.LO2C idx_Q2, idx_Qk, param_magic_k, RZ; +--:-:-:-:1 SHR.U32 idx_Q2, idx_Q2, param_shift_k; +// idx_k = idx_Qk % k +--:-:-:-:1 IADD neg_k, RZ, -param_k; +--:-:-:-:1 XMAD.S16.U16 idx_k, neg_k, idx_Q2, idx_Qk; + +// idx_K = idx_K * blk_k + idx_k +04:-:-:-:1 XMAD idx_K, idx_K, param_k, idx_k; + +// Implement a square wave block id remapping (for all but last row (if odd number of rows)) +// idx_P = idx_P2 * 2 +// idx_Q = idx_Q2 +// if idx_P2 != gridP2: +// idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1) +// idx_Q = idx_Q2 >> 1 +--:-:-:-:1 ISETP.NE.AND P1, PT, idx_P2, param_P2, PT; +--:-:-:-:1 SHL idx_P, idx_P2, 1; +--:-:-:-:1 @P1 LOP.AND q1, idx_Q2, 1; +--:-:-:-:1 @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1 +--:-:-:-:1 @P1 LOP.XOR q1, q1, q2; +--:-:-:-:1 @P1 IADD idx_P, idx_P, q1; +--:-:-:-:1 @P1 SHR.U32 idx_Q, idx_Q2, 1; +--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2; + +// Scan backwards on odd rows +// if idx_P2 & 1: +// idx_Q = Q - idx_Q - 1 +--:-:-:-:1 LOP.AND.NZ P2, RZ, idx_P2, 1; +--:-:-:-:1 MOV negOne, -1; +--:-:-:-:1 @P2 IADD3 idx_Q, -idx_Q, param_Q, negOne; + +EOF + } + + sub load_zeros + { + return "--:-:-:-:1 STS.128 [addr_zero], RZ;\n" . 
+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + } + + sub begin_lut + { + return <<'EOF'; +--:-:-:-:5 @P0 BRA.U END_SETUP; + + +--:-:-:-:1 STS.128 [addr_mpqk], mpqk; + +--:-:-:-:1 MOV rst, tid; +--:-:-:-:1 MOV lutStore2, RZ; +--:-:-:-:1 MOV lutSize, RZ; +--:-:-:-:1 MOV warp_count, 32; + +--:-:-:-:1 IADD mask_shr, -tid, 32; +--:-:-:-:1 SHR.U32 dep_thd_mask, negOne, mask_shr; +EOF + } + + sub end_lut + { + return sprintf <<'EOF', $addr_shift; + +// Get a mask of all valid slices in the warp +--:-:-:-:1 VOTE.ANY ballot, PT, P1; +// Count the total valid slices +--:-:2:-:1 POPC warp_slices, ballot; +// Prepare lutStore for this and next loop +--:-:-:-:1 @P1 MOV lutStore, lutStore2; +02:-:-:-:1 ISCADD lutStore2, warp_slices, lutStore2, 3; +// Count all the valid slices below this threadid +--:-:-:-:1 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot; +--:-:3:-:1 @P1 POPC dep_thd_cnt, dep_thd_bits; +// use the rst increment to space the barrier sync +--:-:-:-:1 IADD rst, rst, 32; +// Update the lutStore address from this count +04:-:-:-:1 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3; +// Store both slice offsets in the lut +--:1:-:-:1 @P1 STS.64 [lutStore + addr_lut], sliceIF; + +// Keep track of the total size of the lut +--:-:-:-:1 IADD lutSize, lutSize, warp_slices; + + +--:-:-:-:5 @P0 BRA.U LUT_LOOP; + +// Share the lut size with the other warp +--:1:-:-:2 STS [addr_szLut], lutSize; + +END_SETUP: + +01:-:-:-:5 BAR.SYNC 0; + +// Grab the caclulated lut size and get it's reciprical +// Get the total reduction depth +--:-:1:-:2 LDS lutSize, [addr_szLut]; +01:-:-:-:0 XMAD endCRST, lutSize, param_C, RZ; +--:-:1:-:2 I2F.F32.S32 lutSizeRcp, lutSize; +01:-:1:-:1 MUFU.RCP lutSizeRcp, lutSizeRcp; + + +// lutSize != 0 +--:-:-:-:1 LOP.AND.NZ P0, RZ, lutSize, -1; +// posCRST = endCRST - tidY - 1 +--:-:-:-:1 IADD3 posCRST, endCRST, -1, -tidY; +// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch. 
+// If it is a multiple of 8 then make a full 8 line fetch. +--:-:-:-:1 LOP.AND.Z P1, partial, endCRST, 7; +--:-:-:-:1 @P1 MOV partial, 8; +// channel = posCRST / lutSize +// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it +--:-:2:-:1 I2F.F32.S32 posCRSTf, posCRST; +03:-:-:-:1 FMUL channel, posCRSTf, lutSizeRcp; +--:-:-:-:1 FFMA channel, channel, 5.9604644775390625e-08, channel; +--:-:2:-:1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST % lutSize) * 8 +02:-:-:-:1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST; +--:-:-:-:1 SHL lutOffset, lutOffset, 3; +// P1 = tidY < partial && +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY, partial, P0; +// offsetIC = channel * DHWN +// offsetFC = channel * K +--:-:-:-:1 XMAD.LO2C offsetIc, channel, param_DHWN, RZ; +--:-:-:-:1 XMAD offsetFc, channel, param_KRST, RZ; +// posCRST -= partial +--:-:-:-:1 IADD posCRST, posCRST, -partial; +--:-:1:-:2 @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut]; + + +// trackI = offsetIN + offsetIC + sliceI + param_I +// trackF = offsetFK + offsetFC + sliceF + param_F +01:-:-:-:1 @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF; +--:-:-:-:5 @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI; +--:-:-:-:6 @P1 LEA trackF0.CC, offsetF, param_F[0], %1$s; +--:-:-:-:1 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, %1$s; +--:-:-:-:6 @P1 LEA trackI0.CC, offsetI, param_I[0], %1$s; +--:-:-:-:0 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, %1$s; +EOF + } + + sub fprop_lut + { + return begin_lut() . <<'EOF' . 
end_lut(); +// mt = m * w - pad_d +// pr = p * u - pad_h +// qs = q * v - pad_w +--:-:-:-:1 XMAD qs, q, param_str_w, RZ; +--:-:-:-:1 XMAD pr, p, param_str_h, RZ; +--:-:-:-:1 XMAD mt, m, param_str_d, RZ; +--:-:-:-:1 IADD qs, qs, -param_pad_w; +--:-:-:-:1 IADD pr, pr, -param_pad_h; +--:-:-:-:1 IADD mt, mt, -param_pad_d; + + +LUT_LOOP: + + +// warp synchronous loop while warp_count < RST +--:-:-:-:1 ISETP.LT.AND P0, PT, warp_count, param_RST, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, rst, param_RST, PT; + +--:-:-:-:1 IADD warp_count, warp_count, 32; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = qs + (s * dil_w) +// y = pr + (r * dil_h) +// z = mt + (t * dil_d) +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 ISETP.GE.AND P4, PT, x, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P5, PT, y, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P6, PT, z, RZ, P6; +--:-:-:-:1 ISETP.LT.AND P4, PT, x, param_W, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, y, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, z, param_D, P6; +--:-:-:-:1 PSETP.AND.AND P1, PT, P4, P5, P6; + + +// sliceI = z*HWN + y*WN + x*N +01:-:-:-:1 XMAD sliceI, x, param_N, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y, param_WN, sliceI; +--:-:-:-:1 XMAD.LO2C sliceI, z, param_HWN, sliceI; +// sliceF = rst * K +--:-:-:-:1 XMAD sliceF, rst, param_K, RZ; + +EOF + } + + sub bprop_lut + { + return begin_lut() . <<'EOF' . 
end_lut(); +--:-:-:-:1 MOV str_d, param_str_d; +--:-:-:-:1 MOV str_h, param_str_h; +--:-:-:-:1 MOV str_w, param_str_w; +// qs = q - pad_w +// pr = p - pad_h +// mt = m - pad_d +--:-:-:-:1 IADD qs, q, -param_pad_w; +--:-:-:-:1 IADD pr, p, -param_pad_h; +--:-:-:-:1 IADD mt, m, -param_pad_d; + + +LUT_LOOP: + + +// warp synchronous loop while warp_count < RST +--:-:-:-:1 ISETP.LT.AND P0, PT, warp_count, param_RST, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, rst, param_RST, PT; +--:-:-:-:1 IADD warp_count, warp_count, 32; +// t = rst / RS +// rs = rst % RS +--:-:-:-:1 XMAD.LO2C t, rst, param_magic_RS, RZ; +--:-:-:-:1 SHR.U32 t, t, param_shift_RS; +--:-:-:-:1 XMAD rs, t, param_RS, RZ; +--:-:-:-:1 IADD rs, -rs, rst; +// r = rs / S +// s = rs % S +--:-:-:-:1 XMAD.LO2C r, rs, param_magic_S, RZ; +--:-:-:-:1 SHR.U32 r, r, param_shift_S; +--:-:-:-:1 XMAD s, r, param_S, RZ; +--:-:-:-:1 IADD s, -s, rs; +// x = qs + (s * dil_w) +// y = pr + (r * dil_h) +// z = mt + (t * dil_d) +--:-:-:-:1 XMAD x, s, param_dil_w, qs; +--:-:-:-:1 XMAD y, r, param_dil_h, pr; +--:-:-:-:1 XMAD z, t, param_dil_d, mt; +--:-:-:-:1 ISETP.GE.AND P4, PT, x, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P5, PT, y, RZ, PT; +--:-:-:-:1 ISETP.GE.AND P6, PT, z, RZ, P6; +// x_prime = x / str_w +// x = x % str_w +--:-:-:-:1 XMAD x_prime, x, param_magic_str_w, RZ; +--:-:-:-:1 SHR.U32 x_prime, x_prime, param_shift_str_w; +--:-:-:-:1 VMAD.U16.U16 x, -x_prime, str_w, x; +// y_prime = y / str_h +// y = y % str_h +--:-:-:-:1 XMAD y_prime, y, param_magic_str_h, RZ; +--:-:-:-:1 SHR.U32 y_prime, y_prime, param_shift_str_h; +--:-:-:-:1 VMAD.U16.U16 y, -y_prime, str_h, y; +// z_prime = z / str_d +// z = z % str_d +--:-:-:-:1 XMAD z_prime, z, param_magic_str_d, RZ; +--:-:-:-:1 SHR.U32 z_prime, z_prime, param_shift_str_d; +--:-:-:-:1 VMAD.U16.U16 z, -z_prime, str_d, z; + +--:-:-:-:1 ISETP.EQ.AND P4, PT, x, RZ, P4; +--:-:-:-:1 ISETP.EQ.AND P5, PT, y, RZ, P5; +--:-:-:-:1 ISETP.EQ.AND P6, PT, z, RZ, P6; +--:-:-:-:1 ISETP.LT.AND P4, PT, x_prime, 
param_W, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, y_prime, param_H, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, z_prime, param_D, P6; +--:-:-:-:1 PSETP.AND.AND P1, PT, P4, P5, P6; + +// sliceI = z_prime*HWN + y_prime*WN + x_prime*N +01:-:-:-:1 XMAD sliceI, x_prime, param_N, RZ; +--:-:-:-:1 XMAD.LO2C sliceI, y_prime, param_WN, sliceI; +--:-:-:-:1 XMAD.LO2C sliceI, z_prime, param_HWN, sliceI; +// sliceF = rst_prime * K +01:-:-:-:1 XMAD sliceF, rst, param_K, RZ; +EOF + } + + sub load_lut + { + return $prop eq 'f' ? fprop_lut() : bprop_lut(); + } + + sub loop_setup + { + my $swap; + if ($shareI == $shareF) + { + $swap = <<'EOF'; +--:-:-:-:0 LOP.XOR writeS, writeS, 4x; +EOF + } + else + { + $swap = <<'EOF'; +--:-:-:-:1 IADD writeIs, writeIs, swapBuf; +--:-:-:-:1 IADD writeFs, writeFs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; +EOF + } + return sprintf <<'EOF', $shareI, $shareF, $stepI, $stepF, $addr_shift, $swap; + +--:-:-:-:0 ISETP.GE.AND P1, PT, posCRST, RZ, PT; +--:-:2:-:1 I2F.F32.S32 posCRSTf, posCRST; + +01:-:-:-:5 BAR.SYNC 0; +%6$s + +--:-:-:-:1 LDS.U.128 j0Ix0, [readIs + 4x<0*%1$-3s + 00>]; +--:-:-:-:1 LDS.U.128 j0Fy0, [readFs + 4x<0*%2$-3s + 00>]; +--:-:-:-:1 LDS.U.128 j0Ix4, [readIs + 4x<0*%1$-3s + %3$s>]; +--:-:1:-:2 LDS.U.128 j0Fy4, [readFs + 4x<0*%2$-3s + %4$s>]; + + +// channel = posCRST / lutSize +02:-:-:-:1 @P1 FMUL channel, posCRSTf, lutSizeRcp; +--:-:-:-:1 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel; +--:-:2:-:1 @P1 F2I.S32.F32.TRUNC channel, channel; +// lutOffset = (posCRST % lutSize) * 8 +02:-:-:-:1 @P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST; +--:-:-:-:1 @P1 SHL lutOffset, lutOffset, 3; +// offsetIC = channel * DHWN +// offsetFC = channel * K +--:-:-:-:1 @P1 XMAD.LO2C offsetIc, channel, param_DHWN, RZ; +--:-:-:-:1 @P1 XMAD offsetFc, channel, param_KRST, RZ; + +--:-:-:-:1 IADD posCRST, posCRST, -8; +--:-:2:-:2 @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut]; + + +// trackI = offsetIN + offsetIC + sliceI + param_I +// trackF = 
offsetFK + offsetFC + sliceF + param_F +02:-:-:-:1 @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF; +--:-:-:-:5 @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI; +--:-:-:-:6 @P1 LEA trackF0.CC, offsetF, param_F[0], %5$s; +--:-:-:-:1 @P1 LEA.HI.X trackF1, offsetF, param_F[1], RZ, %5$s; +--:-:-:-:6 @P1 LEA trackI0.CC, offsetI, param_I[0], %5$s; +--:-:-:-:0 @P1 LEA.HI.X trackI1, offsetI, param_I[1], RZ, %5$s; +EOF + } + + sub main_loop + { + our %insert; + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx0, [readIs + 4x<%d*%-3s + 00>];\n", $rsPred, $nOdd, $rsOffset, $shareI; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dFy0, [readFs + 4x<%d*%-3s + 00>];\n", $rsPred, $nOdd, $rsOffset, $shareF; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dIx4, [readIs + 4x<%d*%-3s + %s>];\n", $rsPred, $nOdd, $rsOffset, $shareI, $stepI; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dFy4, [readFs + 4x<%d*%-3s + %s>];\n", $rsPred, $nOdd, $rsOffset, $shareF, $stepF; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + } + + + sub output_setup + { + my ($tidOX, $warp_shift, $bsum_shift) = @_; + my $out; + + $out .= qq{ +02:-:-:-:1 SHR.U32 bsum_offset, tidOX, $bsum_shift; +04:-:-:-:1 ISCADD bsum_offset, idx_N, bsum_offset, $warp_shift; +01:-:-:-:1 XMAD bsum_offset, idx_Q, param_gridN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_P, param_gridQN, bsum_offset; +--:-:-:-:1 XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset; + +--:-:-:-:1 LOP.AND.Z P5, RZ, tidOX, $tidOX; + } if $bsum; + + $out .= qq{ +// out_offset = m*PQN + p*QN + q*N + n +01:-:-:-:1 XMAD out_offset, q, param_N, n; +--:-:-:-:1 XMAD.LO2C out_offset, p, param_QN, out_offset; +--:-:-:-:1 XMAD.LO2C out_offset, m, param_PQN, out_offset; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV32I one, 1.0; + +--:-:-:-:1 ISETP.EQ.AND P2, PT, RZ, param_flags, PT; // no output +--:-:-:-:1 ISETP.LT.AND P0, PT, n, param_N, P2; + }; + + $out .= $half ? q{ +--:-:-:-:1 ISETP.LT.AND P1, PT, n, param_N, P2; + } : qq{ +--:-:-:-:1 IADD n, n, $stepI; +--:-:-:-:1 ISETP.LT.AND P1, PT, n, param_N, P2; + }; + return $out; + } + + sub output + { + my $out = q{ +--:-:-:-:5 BAR.SYNC 0; + }; + + foreach my $y (0..7) + { + my $incK = $y == 4 && !$remapF ? $stepF-3 : 1; + my $stepK = $y ? 
"\n--:-:-:-:1 IADD k, k, $incK;" : ""; + + $out .= qq{$stepK +--:-:-:-:1 FMUL cs0, cx0y$y, alpha; +--:-:-:-:1 FMUL cs1, cx1y$y, alpha; +--:-:-:-:1 FMUL cs2, cx2y$y, alpha; +--:-:-:-:1 FMUL cs3, cx3y$y, alpha; +--:-:-:-:1 FMUL cs4, cx4y$y, alpha; +--:-:-:-:1 FMUL cs5, cx5y$y, alpha; +--:-:-:-:1 FMUL cs6, cx6y$y, alpha; +--:-:-:-:0 FMUL cs7, cx7y$y, alpha; +--:-:-:-:5 CAL STORE_O; + }; + } + $out .= q{ + +--:-:-:-:5 EXIT; + +STORE_O: + + +30:-:-:-:1 XMAD offset, k, param_MPQN, out_offset; +--:-:-:-:1 XMAD.PSL offset, k, param_MPQN.H1, offset; +--:-:-:-:1 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n < N +--:-:-:-:1 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n < N + }; + + if ($beta || $brelu || $bprelu) + { + $out .= qq{ +--:-:-:-:1 LEA Out0.CC, offset, param_X[0], $addr_shift; +--:-:-:-:1 LEA.HI.X Out1, offset, param_X[1], RZ, $addr_shift; + }; + $out .= $half ? q{ +--:-:5:-:2 @P2 LDG.E.128 b0, [Out]; + } : q{ +--:-:5:-:1 @P2 LDG.E.128 b0, [Out + 4x<00>]; +--:-:6:-:1 @P3 LDG.E.128 b4, [Out + 4x<$stepI>]; + }; + } + + $out .= q{ +--:-:-:-:1 LEA Sum0.CC, k, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, k, param_Sum[1], RZ, 2; + +--:-:6:-:1 @P2 LDG.E.CI b0, [Sum]; +--:-:-:-:1 @!P2 MOV b0, RZ; + } if $bias; + + $out .= q{ + +--:-:-:-:1 STS.128 [writeCs + 4x<00>], cs0; +--:-:-:-:1 STS.128 [writeCs + 4x<$remapI ? 4 : $stepI>], cs4; +--:-:1:-:1 @P2 LDS.U.128 out0, [readCs + 4x<00>]; +--:-:2:-:1 @P3 LDS.U.128 out4, [readCs + 4x<$half ? 
4 : $stepI>]; + + + + }; + + $out .= q{ +21:-:-:-:1 FADD out0, out0, b0; +--:-:-:-:1 FADD out1, out1, b0; +--:-:-:-:1 FADD out2, out2, b0; +--:-:-:-:1 FADD out3, out3, b0; +02:-:-:-:1 FADD out4, out4, b0; +--:-:-:-:1 FADD out5, out5, b0; +--:-:-:-:1 FADD out6, out6, b0; +--:-:-:-:1 FADD out7, out7, b0; + } if $bias; + + $out .= q{ +01:-:-:-:1 FMNMX out0, out0, RZ, !PT; +--:-:-:-:1 FMNMX out1, out1, RZ, !PT; +--:-:-:-:1 FMNMX out2, out2, RZ, !PT; +--:-:-:-:1 FMNMX out3, out3, RZ, !PT; +02:-:-:-:1 FMNMX out4, out4, RZ, !PT; +--:-:-:-:1 FMNMX out5, out5, RZ, !PT; +--:-:-:-:1 FMNMX out6, out6, RZ, !PT; +--:-:-:-:1 FMNMX out7, out7, RZ, !PT; + } if $relu; + + $out .= q{ +// maximum(x, 0) + slope * minimum(0, x) +01:-:-:-:1 FMNMX b0, out0, RZ, !PT; +--:-:-:-:1 FMNMX b1, out1, RZ, !PT; +--:-:-:-:1 FMNMX b2, out2, RZ, !PT; +--:-:-:-:1 FMNMX b3, out3, RZ, !PT; +02:-:-:-:1 FMNMX b4, out4, RZ, !PT; +--:-:-:-:1 FMNMX b5, out5, RZ, !PT; +--:-:-:-:1 FMNMX b6, out6, RZ, !PT; +--:-:-:-:1 FMNMX b7, out7, RZ, !PT; + +--:-:-:-:1 FMNMX x0, out0, RZ, PT; +--:-:-:-:1 FMNMX x1, out1, RZ, PT; +--:-:-:-:1 FMNMX x2, out2, RZ, PT; +--:-:-:-:1 FMNMX x3, out3, RZ, PT; +--:-:-:-:1 FMNMX x4, out4, RZ, PT; +--:-:-:-:1 FMNMX x5, out5, RZ, PT; +--:-:-:-:1 FMNMX x6, out6, RZ, PT; +--:-:-:-:1 FMNMX x7, out7, RZ, PT; + +--:-:-:-:1 FFMA out0, x0, param_beta, b0; +--:-:-:-:1 FFMA out1, x1, param_beta, b1; +--:-:-:-:1 FFMA out2, x2, param_beta, b2; +--:-:-:-:1 FFMA out3, x3, param_beta, b3; +--:-:-:-:1 FFMA out4, x4, param_beta, b4; +--:-:-:-:1 FFMA out5, x5, param_beta, b5; +--:-:-:-:1 FFMA out6, x6, param_beta, b6; +--:-:-:-:1 FFMA out7, x7, param_beta, b7; + } if $prelu; + + $out .= q{ + + }; + + $out .= q{ +13:-:-:-:1 @P2 F2F.F32.F16 b7, b3.H1; +--:-:-:-:1 @P2 F2F.F32.F16 b6, b3.H0; +--:-:-:-:1 @P2 F2F.F32.F16 b5, b2.H1; +--:-:-:-:1 @P2 F2F.F32.F16 b4, b2.H0; +--:-:-:-:1 @P2 F2F.F32.F16 b3, b1.H1; +--:-:-:-:1 @P2 F2F.F32.F16 b2, b1.H0; +--:-:-:-:1 @P2 F2F.F32.F16 b1, b0.H1; +--:-:5:-:2 @P2 
F2F.F32.F16 b0, b0.H0; + } if $half && ($beta || $brelu || $bprelu); + + $out .= q{ + + }; + + $out .= q{ +11:-:-:-:1 @P2 FFMA out0, b0, param_beta, out0; +--:-:-:-:1 @P2 FFMA out1, b1, param_beta, out1; +--:-:-:-:1 @P2 FFMA out2, b2, param_beta, out2; +--:-:-:-:1 @P2 FFMA out3, b3, param_beta, out3; +22:-:-:-:1 @P3 FFMA out4, b4, param_beta, out4; +--:-:-:-:1 @P3 FFMA out5, b5, param_beta, out5; +--:-:-:-:1 @P3 FFMA out6, b6, param_beta, out6; +--:-:-:-:1 @P3 FFMA out7, b7, param_beta, out7; + } if $beta; + + $out .= q{ +//delta *= (x > 0) +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +11:-:-:-:1 FSETP.GT.AND P0, PT, b0, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b1, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b2, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b3, RZ, PT; +--:-:-:-:1 @!P0 MOV out0, RZ; +--:-:-:-:1 @!P1 MOV out1, RZ; +--:-:-:-:1 @!P2 MOV out2, RZ; +--:-:-:-:1 @!P3 MOV out3, RZ; +22:-:-:-:1 FSETP.GT.AND P0, PT, b4, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b5, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b6, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b7, RZ, PT; +--:-:-:-:1 @!P0 MOV out4, RZ; +--:-:-:-:1 @!P1 MOV out5, RZ; +--:-:-:-:1 @!P2 MOV out6, RZ; +--:-:-:-:1 @!P3 MOV out7, RZ; +--:-:-:-:5 R2P PR, preds, 0x0f; + } if $brelu; + + $out .= q{ +//delta *= ((x > 0) + slope * (x < 0)) +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; +11:-:-:-:1 FSETP.GT.AND P0, PT, b0, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b1, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b2, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b3, RZ, PT; +--:-:-:-:1 SEL x0, one, RZ, P0; +--:-:-:-:1 SEL x1, one, RZ, P1; +--:-:-:-:1 SEL x2, one, RZ, P2; +--:-:-:-:1 SEL x3, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b0, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b1, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b2, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b3, RZ, PT; +--:-:-:-:1 SEL b0, one, RZ, P0; +--:-:-:-:1 SEL b1, one, RZ, P1; +--:-:-:-:1 SEL b2, one, RZ, P2; +--:-:-:-:1 SEL b3, one, RZ, P3; +--:-:-:-:1 FFMA b0, b0, param_beta, x0; 
+--:-:-:-:1 FFMA b1, b1, param_beta, x1; +--:-:-:-:1 FFMA b2, b2, param_beta, x2; +--:-:-:-:1 FFMA b3, b3, param_beta, x3; +--:-:-:-:1 FMUL out0, out0, b0; +--:-:-:-:1 FMUL out1, out1, b1; +--:-:-:-:1 FMUL out2, out2, b2; +--:-:-:-:1 FMUL out3, out3, b3; +22:-:-:-:1 FSETP.GT.AND P0, PT, b4, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P1, PT, b5, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P2, PT, b6, RZ, PT; +--:-:-:-:1 FSETP.GT.AND P3, PT, b7, RZ, PT; +--:-:-:-:1 SEL x4, one, RZ, P0; +--:-:-:-:1 SEL x5, one, RZ, P1; +--:-:-:-:1 SEL x6, one, RZ, P2; +--:-:-:-:1 SEL x7, one, RZ, P3; +--:-:-:-:1 FSETP.LT.AND P0, PT, b4, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, b5, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, b6, RZ, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, b7, RZ, PT; +--:-:-:-:1 SEL b4, one, RZ, P0; +--:-:-:-:1 SEL b5, one, RZ, P1; +--:-:-:-:1 SEL b6, one, RZ, P2; +--:-:-:-:1 SEL b7, one, RZ, P3; +--:-:-:-:1 R2P PR, preds, 0x0f; +--:-:-:-:1 FFMA b4, b4, param_beta, x4; +--:-:-:-:1 FFMA b5, b5, param_beta, x5; +--:-:-:-:1 FFMA b6, b6, param_beta, x6; +--:-:-:-:1 FFMA b7, b7, param_beta, x7; +--:-:-:-:1 FMUL out4, out4, b4; +--:-:-:-:1 FMUL out5, out5, b5; +--:-:-:-:1 FMUL out6, out6, b6; +--:-:-:-:1 FMUL out7, out7, b7; + } if $bprelu; + + $out .= q{ +--:-:-:-:1 @!P2 MOV sum0, RZ; +--:-:-:-:1 @!P3 MOV sum2, RZ; +01:-:-:-:1 @P2 FADD sum0, out0, out1; +--:-:-:-:1 @P2 FADD sum1, out2, out3; +02:-:-:-:1 @P3 FADD sum2, out4, out5; +--:-:-:-:1 @P3 FADD sum3, out6, out7; +--:-:-:-:1 @P2 FADD sum0, sum0, sum1; +--:-:-:-:1 @P3 FADD sum2, sum2, sum3; +--:-:-:-:1 FADD sum0, sum0, sum2; + } if $bsum; + + $out .= q{ + +01:-:-:-:1 @P2 F2F.F16.F32 out0, out0; +--:-:-:-:1 @P2 F2F.F16.F32 out1, out1; +--:-:-:-:1 @P2 F2F.F16.F32 out2, out2; +--:-:1:-:1 @P2 F2F.F16.F32 out3, out3; +02:-:-:-:1 @P2 F2F.F16.F32 out4, out4; +--:-:-:-:1 @P2 F2F.F16.F32 out5, out5; +--:-:-:-:1 @P2 F2F.F16.F32 out6, out6; +--:-:2:-:1 @P2 F2F.F16.F32 out7, out7; + + } if $half; + + $out .= q{ + + }; + + $out .= $half ? 
qq{ + +--:-:-:-:1 LEA Out0.CC, offset, param_O[0], $addr_shift; +--:-:-:-:1 LEA.HI.X Out1, offset, param_O[1], RZ, $addr_shift; + +01:-:-:-:1 \@P2 BFI c0, out1, 0x1010, out0; +--:-:-:-:1 \@P2 BFI c1, out3, 0x1010, out2; +02:-:-:-:1 \@P2 BFI c2, out5, 0x1010, out4; +--:-:-:-:1 \@P2 BFI c3, out7, 0x1010, out6; + +--:5:-:-:1 \@P2 STG.E.CG.128 [Out], c0; + + } : qq{ + +--:-:-:-:1 LEA Out0.CC, offset, param_O[0], $addr_shift; +--:-:-:-:1 LEA.HI.X Out1, offset, param_O[1], RZ, $addr_shift; + +01:-:-:-:1 \@P2 STG.E.CG.128 [Out + 4x<00>], out0; +02:5:-:-:1 \@P3 STG.E.CG.128 [Out + 4x<$stepI>], out4; + + }; + + $out .= q{ + +--:-:-:-:1 XMAD.LO2C offset, k, param_gridMPQN, bsum_offset; +--:-:-:-:1 LEA Sum0.CC, offset, param_Sum[0], 2; +--:-:-:-:1 LEA.HI.X Sum1, offset, param_Sum[1], RZ, 2; + +--:-:-:-:1 ISETP.LT.AND P6, PT, k, param_K, P5; // k < K && tid31 == 0 + +--:-:2:-:2 SHFL.BFLY PT, sum1, sum0, 1, 0x1f; +02:-:-:-:4 FADD sum0, sum1, sum0; +--:-:2:-:2 SHFL.BFLY PT, sum1, sum0, 2, 0x1f; +02:-:-:-:4 FADD sum0, sum1, sum0; +--:-:2:-:2 SHFL.BFLY PT, sum1, sum0, 4, 0x1f; +02:-:-:-:2 FADD sum0, sum1, sum0; + +--:6:-:-:1 @P6 STG.E.CG [Sum], sum0; + + } if $bsum; + + $out .= q{ +--:-:-:-:5 RET; + }; + } + +-] diff --git a/Kernel/SGEMM/Kepler/Makefile b/Kernel/SGEMM/Kepler/Makefile new file mode 100644 index 0000000..9df39ec --- /dev/null +++ b/Kernel/SGEMM/Kepler/Makefile @@ -0,0 +1,19 @@ +BINS := sgemm_nn_128x128 sgemm_nt_128x128 sgemm_tn_128x128 \ + sgemm_nn_128x128_vec sgemm_tn_128x128_vec sgemm_nt_128x128_vec +TARGETS := $(addsuffix .cubin, $(BINS)) +TEMPLATES := $(addsuffix _template.cubin, $(BINS)) + +all: $(BINS) + +$(BINS): + nvcc -arch sm_35 -m 64 $@.cu -cubin -O3 -o $@_template.cubin + KeplerAs.pl -i $@.sass $@_template.cubin $@.cubin + +clean: + rm -f $(TARGETS) $(TEMPLATES) + +# no rule here creates a file named after its own target, so declare them all phony +.PHONY: all clean $(BINS) + +# utils: `make print-VAR` reports the flavor and value of VAR +print-% : ; $(info $* is $(flavor $*) variable set to [$($*)]) @true diff --git a/Kernel/SGEMM/Kepler/README.md b/Kernel/SGEMM/Kepler/README.md new file mode 100644
index 0000000..82a5a4f --- /dev/null +++ b/Kernel/SGEMM/Kepler/README.md @@ -0,0 +1,3 @@ +# KeplerGEMM + +Faster GEMM diff --git a/Kernel/SGEMM/Kepler/sgemm_common_128x128.sass b/Kernel/SGEMM/Kepler/sgemm_common_128x128.sass new file mode 100644 index 0000000..a334224 --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_common_128x128.sass @@ -0,0 +1,378 @@ +# sgemm_common_128x128 + +//////////////////////////////////////////// +// debug +//-:-:-:-:00 MOV tmp_param0, param_C[0]; +//-:-:-:-:00 MOV tmp_param1, param_C[1]; +// +//-:-:-:-:00 MOV32I k, 0x3f8ccccd; +//-:-:-:-:00 ST.E [tmp_param0], k; +//-:-:-:-:00 EXIT; +///////////////////////////////////////// + +-:-:-:-:00 LDS.128 j0Ay0, [readAs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS.128 j0Bx0, [readBs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS.128 j0Ay4, [readAs + 4x<0*128 + 64>]; +-:-:-:-:00 LDS.128 j0Bx4, [readBs + 4x<0*128 + 64>]; + +LOOP: + + + + our @top; + our %insert; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + + # cOrder + # register reuse + + push @cOrder, [0,0]; + push @cOrder, [0,1]; + push @cOrder, [1,1]; + push @cOrder, [2,0]; + push @cOrder, [1,0]; + push @cOrder, [2,1]; + push @cOrder, [2,3]; + push @cOrder, [2,2]; + push @cOrder, [1,2]; + push @cOrder, [0,3]; + push @cOrder, [1,3]; + push @cOrder, [0,2]; + push @cOrder, [0,4]; + push @cOrder, [0,5]; + push @cOrder, [1,5]; + push @cOrder, [2,4]; + push @cOrder, [1,4]; + push @cOrder, [2,5]; + push @cOrder, [2,7]; + push @cOrder, [2,6]; + push @cOrder, [1,6]; + push @cOrder, [0,7]; + push @cOrder, [1,7]; + push @cOrder, [0,6]; + push @cOrder, [3,6]; + push @cOrder, [3,7]; + push @cOrder, [4,7]; + push @cOrder, [5,6]; + push @cOrder, [4,6]; + push @cOrder, [5,7]; + push @cOrder, [5,5]; + push @cOrder, [5,4]; + push @cOrder, [4,4]; + push @cOrder, [3,5]; + push @cOrder, [4,5]; + push @cOrder, [3,4]; + push @cOrder, [3,2]; + push @cOrder, [3,3]; + push @cOrder, [4,3]; + push @cOrder, [5,2]; + push @cOrder, [4,2]; + push @cOrder, [5,3]; +
push @cOrder, [5,1]; + push @cOrder, [5,0]; + push @cOrder, [4,0]; + push @cOrder, [3,1]; + push @cOrder, [4,1]; + push @cOrder, [3,0]; + push @cOrder, [6,0]; + push @cOrder, [7,0]; + push @cOrder, [7,1]; + push @cOrder, [6,2]; + push @cOrder, [6,1]; + push @cOrder, [7,2]; + push @cOrder, [7,5]; + push @cOrder, [6,5]; + push @cOrder, [6,4]; + push @cOrder, [7,3]; + push @cOrder, [7,4]; + push @cOrder, [6,3]; + push @cOrder, [6,6]; + push @cOrder, [6,7]; + push @cOrder, [7,7]; + push @cOrder, [7,6]; + + my $out = join '', @top; + my $loopc = 0; + + foreach my $j (0 .. 7) + { + # $odd = 0, 1, 0, 1, 0, 1, 0, 1 + # $nOdd = 1, 0, 1, 0, 1, 0, 1, 0 + # $rsOffset = 1, 2, 3, 4, 5, 6, 7, 0 + # $rsPred = ' ', ' ', ' ', ' ', ' ', ' ', ' ', @P0 + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + #$insert{"j${j}c5"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + #$insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy2, [readAs + 4x<%d*128 + 2>];\n", $rsPred, $nOdd, $rsOffset; + #$insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + #$insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy6, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + #$insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + #$insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx2, [readBs + 4x<%d*128 + 2>];\n", $rsPred, $nOdd, $rsOffset; + #$insert{"j${j}c47"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + #$insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx6, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + + # to avoid conflict with the second FFMA + # 5 11 17 59 are bank 2 friendly, two empty and two reuse + # 23 29 35 41 are bank 3 friendly, two empty and two reuse + # 
LDS.64 throughput is higher + $insert{"j${j}c5"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx2, [readBs + 4x<%d*128 + 2>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx6, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c29"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy2, [readAs + 4x<%d*128 + 2>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy6, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 ..
63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $ctrl = "-:-:-:-:00"; + + if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) { + $ins = "-:G:D:-:00 NOP;\n"; + } + + if ($c > 60 && !$ins){ + $ins = "-:-:D:-:07 NOP;\n"; + } + + # 04 and 05 are dual issued + if($ins) { + $ctrl = "-:-:D:-:04"; + } else { + if(($c - 1) % 6 == 0 || ($c - 3) % 6 == 0){ + $ctrl = "-:-:D:-:04"; + } + else{ + $ctrl = "-:-:D:-:05"; + } + } + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x, $y, $odd, $x, $odd, $y, $x, $y, $ins; + $loopc = $loopc + 1; + } + } + return $out; + + + + +-:-:-:-:00 S2R blockA, SR_CTAID.Y; +-:-:-:-:00 S2R blockB, SR_CTAID.Z; +-:-:-:-:00 S2R blockZ, SR_CTAID.X; + +-:-:-:-:00 LOP.AND tid_31, tid, 31; +-:-:-:-:00 LOP.AND tid_96, tid, 96; +-:-:-:-:00 LOP.AND tid_128, tid, 128; + +// writeCs = readAs * 32 + readBs; +-:-:-:-:00 LOP.AND readAs, readAs, 0xfff; +-:-:-:-:00 LOP.AND readBs, readBs, 0xfff; +-:-:-:-:00 ISCADD writeCs, readAs, readBs, 5; + +// cx = tid_31 | (tid_128 >> 2); +-:-:-:-:00 SHR.U32 cx00, tid_128, 2; +-:-:-:-:00 LOP.OR cx00, tid_31, cx00; + +// readCs = ((tid_96 << 4) | cx) << 2; +-:-:-:-:00 SHL readCs, tid_96, 4; +-:-:-:-:00 LOP.OR readCs, readCs, cx00; +-:-:-:-:00 SHL readCs, readCs, 2; + +// cx += blockB*128; +-:-:-:-:00 ISCADD cx00, blockB, cx00, 7; +-:-:-:-:00 IADD cx64, cx00, 64; + +// cy = blockA*128 + (tid_96 >> 1) +-:-:-:-:00 SHR.U32 cy00, tid_96, 1; +-:-:-:-:00 ISCADD cy00, blockA, cy00, 7; + +// C += (ldcz*blockZ + ldc*cy + cx00) * 4; +-:-:-:-:00 MOV ldcz, RZ; +-:-:-:-:00 MOV ldc, param_ldc; +-:-:-:-:00 IMAD ci, ldc, cy00, cx00; +-:-:-:-:00 IMAD ci, ldcz, blockZ, ci; +-:-:-:-:00 MOV tmp_param0, param_C[0]; +-:-:-:-:00 MOV tmp_param1, param_C[1]; +-:-:-:-:00 SHL tmp_shl, ci, 2; +-:-:-:-:00 IADD C00y0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X C00y1, RZ, tmp_param1; + +// ldc1 = ldc (byte) +// ldc4 = 4ldc (byte) +// ldc60 = 60ldc (byte) +-:-:-:-:00 SHL ldc1, ldc, 2; +-:-:-:-:00
SHL ldc4, ldc, 4; +-:-:-:-:00 ISCADD ldc60, ldc, -ldc4, 8; + +-:-:-:-:00 MOV alpha, param_alpha; +-:-:-:-:00 MOV beta, param_beta; + +// Apply beta +-:-:-:-:00 ISETP.NE.AND P6, PT, beta, RZ, PT; + +// interleave for high throughput +-:-:-:-:00 IADD C04y0.CC, C00y0, ldc4; +-:-:-:-:00 IADD cy04, cy00, 4; +-:-:-:-:00 IADD.X C04y1, C00y1, RZ; +-:-:-:-:00 IADD C08y0.CC, C04y0, ldc4; +-:-:-:-:00 IADD cy08, cy00, 8; +-:-:-:-:00 IADD.X C08y1, C04y1, RZ; +-:-:-:-:00 IADD C12y0.CC, C08y0, ldc4; +-:-:-:-:00 IADD cy12, cy00, 12; +-:-:-:-:00 IADD.X C12y1, C08y1, RZ; + +-:-:-:-:00 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "-:-:-:-:00 IADD C00y0.CC, C00y0, ldc60;\n" . + "-:-:-:-:00 IADD cy00, cy00, 60;\n" . + "-:-:-:-:00 IADD.X C00y1, C00y1, RZ;\n" . + "-:-:-:-:00 IADD C04y0.CC, C04y0, ldc60;\n" . + "-:-:-:-:00 IADD cy04, cy04, 60;\n" . + "-:-:-:-:00 IADD.X C04y1, C04y1, RZ;\n" . + "-:-:-:-:00 IADD C08y0.CC, C08y0, ldc60;\n" . + "-:-:-:-:00 IADD cy08, cy08, 60;\n" . + "-:-:-:-:00 IADD.X C08y1, C08y1, RZ;\n" . + "-:-:-:-:00 IADD C12y0.CC, C12y0, ldc60;\n" . + "-:-:-:-:00 IADD cy12, cy12, 60;\n" . + "-:-:-:-:00 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "-:-:-:-:00 FMUL c0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL c1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL c2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL c3, cx3y%d, alpha;\n" . + "-:-:-:-:00 FMUL c4, cx4y%d, alpha;\n" . + "-:-:-:-:00 FMUL c5, cx5y%d, alpha;\n" . + "-:-:-:-:00 FMUL c6, cx6y%d, alpha;\n" . 
+ "-:-:-:-:00 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "-:-:-:-:00 CAL STORE_C;\n\n"; + } + return $out; + + + +-:-:-:-:00 EXIT; + +STORE_C: + +-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, P6; +-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P1, PT, cy00, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, cy04, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +-:-:-:-:00 @P0 LD.E d0, [C00y + 4x<00>]; +-:-:-:-:00 @P1 LD.E d1, [C00y + 4x<64>]; +-:-:-:-:00 @P2 LD.E d2, [C04y + 4x<00>]; +-:-:-:-:00 @P3 LD.E d3, [C04y + 4x<64>]; + +-:-:-:-:00 @!P0 MOV d0, RZ; +-:-:-:-:00 @!P1 MOV d1, RZ; +-:-:-:-:00 @!P2 MOV d2, RZ; +-:-:-:-:00 @!P3 MOV d3, RZ; + +-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, PT; +-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P1, PT, cy00, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, cy04, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P3, PT, cy04, param_m, P5; +-:-:-:-:00 IADD cy00, cy00, 1; +-:-:-:-:00 IADD cy04, cy04, 1; + +// beta != 0 +-:-:-:-:00 ISETP.NE.AND P6, PT, beta, RZ, PT; + +-:-:-:-:00 STS.128 [writeCs+4x<00>], c0; +-:-:-:-:00 STS.128 [writeCs+4x<64>], c4; +-:-:-:-:00 LDS c0, [readCs + 4x<0*128 + 00>]; +-:-:-:-:00 LDS c1, [readCs + 4x<0*128 + 64>]; +-:-:-:-:00 LDS c2, [readCs + 4x<1*128 + 00>]; +-:-:-:-:00 LDS c3, [readCs + 4x<1*128 + 64>]; + +-:-:-:-:00 @P6 FFMA c0, d0, beta, c0; +-:-:-:-:00 @P6 FFMA c1, d1, beta, c1; +-:-:-:-:00 @P6 FFMA c2, d2, beta, c2; +-:-:-:-:00 @P6 FFMA c3, d3, beta, c3; + +-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, P6; +-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +-:-:-:-:00 @P0 ST.E.CG [C00y0 + 4x<00>], c0; +-:-:-:-:00 @P1 ST.E.CG [C00y0 + 4x<64>], c1; +-:-:-:-:00 @P2 ST.E.CG [C04y0 + 4x<00>], c2; +-:-:-:-:00 @P3 ST.E.CG [C04y0 + 4x<64>], c3; + +-:-:-:-:00 ISETP.LT.AND P0, PT, cy08, param_m, P4; +-:-:-:-:00 
ISETP.LT.AND P1, PT, cy08, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, cy12, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +-:-:-:-:00 @P0 LD.E d0, [C08y0 + 4x<00>]; +-:-:-:-:00 @P1 LD.E d1, [C08y0 + 4x<64>]; +-:-:-:-:00 @P2 LD.E d2, [C12y0 + 4x<00>]; +-:-:-:-:00 @P3 LD.E d3, [C12y0 + 4x<64>]; +-:-:-:-:00 @!P0 MOV d0, RZ; +-:-:-:-:00 @!P1 MOV d1, RZ; +-:-:-:-:00 @!P2 MOV d2, RZ; +-:-:-:-:00 @!P3 MOV d3, RZ; + +-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, PT; +-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +-:-:-:-:00 ISETP.LT.AND P0, PT, cy08, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P1, PT, cy08, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, cy12, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +-:-:-:-:00 IADD C00y0.CC, C00y0, ldc1; +-:-:-:-:00 IADD cy08, cy08, 1; +-:-:-:-:00 IADD cy12, cy12, 1; +-:-:-:-:00 IADD.X C00y1, C00y1, RZ; +-:-:-:-:00 IADD C04y0.CC, C04y0, ldc1; +-:-:-:-:00 IADD.X C04y1, C04y1, RZ; + +-:-:-:-:00 LDS c0, [readCs + 4x<2*128 + 00>]; +-:-:-:-:00 LDS c1, [readCs + 4x<2*128 + 64>]; +-:-:-:-:00 LDS c2, [readCs + 4x<3*128 + 00>]; +-:-:-:-:00 LDS c3, [readCs + 4x<3*128 + 64>]; + +-:-:-:-:00 @P6 FFMA c0, d0, beta, c0; +-:-:-:-:00 @P6 FFMA c1, d1, beta, c1; +-:-:-:-:00 @P6 FFMA c2, d2, beta, c2; +-:-:-:-:00 @P6 FFMA c3, d3, beta, c3; + +-:-:-:-:00 @P0 ST.E.CG [C08y0 + 4x<00>], c0; +-:-:-:-:00 @P1 ST.E.CG [C08y0 + 4x<64>], c1; +-:-:-:-:00 @P2 ST.E.CG [C12y0 + 4x<00>], c2; +-:-:-:-:00 @P3 ST.E.CG [C12y0 + 4x<64>], c3; + +-:-:-:-:00 IADD C08y0.CC, C08y0, ldc1; +-:-:-:-:00 IADD.X C08y1, C08y1, RZ; +-:-:-:-:00 IADD C12y0.CC, C12y0, ldc1; +-:-:-:-:00 IADD.X C12y1, C12y1, RZ; + +-:-:-:-:00 RET; + diff --git a/Kernel/SGEMM/Kepler/sgemm_common_128x32.sass b/Kernel/SGEMM/Kepler/sgemm_common_128x32.sass new file mode 100644 index 0000000..6af763c --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_common_128x32.sass @@ -0,0 +1,220 @@ +# sgemm_common_128x32 + +-:-:-:-:00 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>]; 
+-:-:-:-:00 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00 + 0*8>]; +-:-:-:-:00 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>]; +-:-:-:-:00 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>]; +-:-:-:-:00 LDS.U.128 j1Bx0, [readBs + 4x<1*32 + 00 + 0*8>]; +-:-:-:-:00 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ? $shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 16 && $stall ? 
'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +-:-:-:-:00 MOV alpha, param_alpha; +-:-:-:-:00 MOV beta, param_beta; +-:-:-:-:00 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 32 + readBs; +-:-:-:-:00 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +-:-:-:-:00 IADD readBs, readBs, -4x; +-:-:-:-:00 @P0 IADD readAs, readAs, -swapBuf; +-:-:-:-:00 @P0 IADD readBs, readBs, -swapBuf; +-:-:-:-:00 ISCADD writeCs, readAs, readBs, 3; + +// readCs = (((tid & 96) << 2) | (tid & 31)) << 2; +-:-:-:-:00 LOP.AND tid31, tid, 31; +-:-:-:-:00 LOP.AND tid96, tid, 96; +-:-:-:-:00 ISCADD readCs, tid96, tid31, 2; +-:-:-:-:00 SHL readCs, readCs, 2; + +// cx = blkB*32 + tid31; +-:-:-:-:00 ISCADD cx, blkB, tid31, 5; + +// cy = blkA*128 + (tid96 >> 1) +-:-:-:-:00 SHR.U32 cy00, tid96, 1; +-:-:-:-:00 ISCADD cy00, blkA, cy00, 7; + +// C += (cy*ldc + cx) * 4; +// C += (ldcz*blkZ + ldc*cy00 + cx) * 4; +-:-:-:-:00 MOV ldc, param_ldc; +-:-:-:-:00 MOV ldcz, param_ldcz; +-:-:-:-:00 XMAD.LO ci, ldc, cy00, cx, xmad_c; +-:-:-:-:00 XMAD.LO2 ci, ldcz, blkZ, ci; +-:-:-:-:00 LEA C00y0.CC, ci, param_C[0], 2; +-:-:-:-:00 LEA.HI.X C00y1, ci, param_C[1], RZ, 2; + +// Apply relu +-:-:-:-:00 LOP.AND.NZ P4, RZ, flags, 2; +// cx < n +-:-:-:-:00 ISETP.LT.AND P6, PT, cx, param_n, PT; +// beta != 0 && cx < n +-:-:-:-:00 ISETP.NE.AND P5, PT, beta, RZ, P6; + +-:-:-:-:00 SHL ldc1, ldc, 2; +-:-:-:-:00 SHL ldc4, ldc, 4; +-:-:-:-:00 ISCADD ldc60, ldc, -ldc4, 8; + +-:-:-:-:00 IADD C04y0.CC, C00y0, ldc4; +-:-:-:-:00 MOV d0, RZ; +-:-:-:-:00 IADD cy04, cy00, 4; +-:-:-:-:00 IADD.X C04y1, C00y1, RZ; +-:-:-:-:00 IADD C08y0.CC, C04y0, ldc4; +-:-:-:-:00 MOV d1, RZ; +-:-:-:-:00 IADD cy08, cy00, 8; +-:-:-:-:00 IADD.X C08y1, C04y1, RZ; +-:-:-:-:00 IADD C12y0.CC, C08y0, ldc4; +-:-:-:-:00 MOV d2, RZ; +-:-:-:-:00 MOV d3, RZ; +-:-:-:-:00 IADD cy12, cy00, 12; +-:-:-:-:00 IADD.X C12y1, C08y1,
RZ; + +-:-:-:-:00 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "-:-:-:-:00 IADD C00y0.CC, C00y0, ldc60;\n" . + "-:-:-:-:00 IADD cy00, cy00, 60;\n" . + "-:-:-:-:00 IADD.X C00y1, C00y1, RZ;\n" . + "-:-:-:-:00 IADD C04y0.CC, C04y0, ldc60;\n" . + "-:-:-:-:00 IADD cy04, cy04, 60;\n" . + "-:-:-:-:00 IADD.X C04y1, C04y1, RZ;\n" . + "-:-:-:-:00 IADD C08y0.CC, C08y0, ldc60;\n" . + "-:-:-:-:00 IADD cy08, cy08, 60;\n" . + "-:-:-:-:00 IADD.X C08y1, C08y1, RZ;\n" . + "-:-:-:-:00 IADD C12y0.CC, C12y0, ldc60;\n" . + "-:-:-:-:00 IADD cy12, cy12, 60;\n" . + "-:-:-:-:00 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "-:-:-:-:00 FMUL c0, cx0y%d, alpha;\n" . + "-:-:-:-:00 FMUL c1, cx1y%d, alpha;\n" . + "-:-:-:-:00 FMUL c2, cx2y%d, alpha;\n" . + "-:-:-:-:00 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "-:-:-:-:00 CAL STORE_C;\n\n"; + } + return $out; + + + +-:-:-:-:00 EXIT; + +STORE_C: + +-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P1, PT, cy04, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, cy08, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +-:-:-:-:00 @P0 LDG.E d0, [C00y]; +-:-:-:-:00 @P1 LDG.E d1, [C04y]; +-:-:-:-:00 @P2 LDG.E d2, [C08y]; +-:-:-:-:00 @P3 LDG.E d3, [C12y]; +-:-:-:-:00 @!P0 MOV d0, RZ; +-:-:-:-:00 @!P1 MOV d1, RZ; +-:-:-:-:00 @!P2 MOV d2, RZ; +-:-:-:-:00 @!P3 MOV d3, RZ; + +-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P6; +-:-:-:-:00 ISETP.LT.AND P1, PT, cy04, param_m, P6; +-:-:-:-:00 ISETP.LT.AND P2, PT, cy08, param_m, P6; +-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +-:-:-:-:00 IADD cy00, cy00, 1; +-:-:-:-:00 IADD cy04, cy04, 1; +-:-:-:-:00 IADD cy08, cy08, 1; +-:-:-:-:00 IADD cy12, cy12, 1; + +-:-:-:-:00 @P4 FMNMX c0, c0, RZ, !PT; +-:-:-:-:00 @P4 FMNMX c1, c1, RZ, !PT; +-:-:-:-:00 @P4 FMNMX c2, c2, RZ, !PT; +-:-:-:-:00 @P4 FMNMX c3, c3, RZ, !PT; + +-:-:-:-:00 STS.128 [writeCs], c0; +-:-:-:-:00 LDS c0, [readCs + 4x<0*32>]; +-:-:-:-:00 LDS c1, [readCs + 
4x<1*32>]; +-:-:-:-:00 LDS c2, [readCs + 4x<2*32>]; +-:-:-:-:00 LDS c3, [readCs + 4x<3*32>]; + +-:-:-:-:00 @P5 FFMA c0, d0, beta, c0; +-:-:-:-:00 @P5 FFMA c1, d1, beta, c1; +-:-:-:-:00 @P5 FFMA c2, d2, beta, c2; +-:-:-:-:00 @P5 FFMA c3, d3, beta, c3; + +-:-:-:-:00 @P0 STG.E.CG [C00y], c0; +-:-:-:-:00 @P1 STG.E.CG [C04y], c1; +-:-:-:-:00 @P2 STG.E.CG [C08y], c2; +-:-:-:-:00 @P3 STG.E.CG [C12y], c3; + +-:-:-:-:00 IADD C00y0.CC, C00y0, ldc1; +-:-:-:-:00 IADD.X C00y1, C00y1, RZ; +-:-:-:-:00 IADD C04y0.CC, C04y0, ldc1; +-:-:-:-:00 IADD.X C04y1, C04y1, RZ; +-:-:-:-:00 IADD C08y0.CC, C08y0, ldc1; +-:-:-:-:00 IADD.X C08y1, C08y1, RZ; +-:-:-:-:00 IADD C12y0.CC, C12y0, ldc1; +-:-:-:-:00 IADD.X C12y1, C12y1, RZ; +-:-:-:-:00 RET; diff --git a/Kernel/SGEMM/Kepler/sgemm_nn_128x128.cu b/Kernel/SGEMM/Kepler/sgemm_nn_128x128.cu new file mode 100644 index 0000000..7a630c8 --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_nn_128x128.cu @@ -0,0 +1,25 @@ +extern "C" /* Stub kernel: per the Makefile, nvcc compiles this file into sgemm_nn_128x128_template.cubin, which is then passed to KeplerAs.pl together with sgemm_nn_128x128.sass. */ +__global__ void __launch_bounds__(256) sgemm_nn_128x128 +( /* parameter order and sizes line up with the param_* constant-bank offsets (c[0x0][0x140] onward) declared in sgemm_nn_128x128.sass */ + const float* param_A, + const float* param_B, + float* param_C, + float param_alpha, + float param_beta, + int param_lda, + int param_ldb8, + int param_ldc, + int param_m, + int param_n, + int param_k + ) { + __shared__ float share[128 * 8 * 4 + 32]; /* NOTE(review): presumably sized to cover addr_zero (4x<128*8*4>) used by the .sass, plus padding - confirm */ + + int tid = threadIdx.x; + + share[tid] = 1; + + __syncthreads(); + + param_C[tid] = share[255 - tid]; +} diff --git a/Kernel/SGEMM/Kepler/sgemm_nn_128x128.sass b/Kernel/SGEMM/Kepler/sgemm_nn_128x128.sass new file mode 100644 index 0000000..a18ae65 --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_nn_128x128.sass @@ -0,0 +1,311 @@ +# Kernel: sgemm_nn_128x128 + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_A[0] : c[0x0][0x140] + param_A[1] : c[0x0][0x144] + param_B[0] : c[0x0][0x148] + param_B[1] : c[0x0][0x14c] + param_C[0] : c[0x0][0x150] + param_C[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_lda : c[0x0][0x160] + param_ldb8 :
c[0x0][0x164] + param_ldc : c[0x0][0x168] + param_m : c[0x0][0x16c] + param_n : c[0x0][0x170] + param_k : c[0x0][0x174] + + + + 64-95 ~ blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, xmad_ta, xmad_tb, tid31, tid128 + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + + 64-67 : j0Ay<0-3> + 68-71 : j0Bx<0-3> + 72-75 : j0Ay<4-7> + 76-79 : j0Bx<4-7> + 80-83 : j1Ay<0-3> + 84-87 : j1Bx<0-3> + 88-91 : j1Ay<4-7> + 92-95 : j1Bx<4-7> + + 96-103 : loadA<0-3>, loadB<0-3> + + 104-107 : trackA<0-1>, trackB<0-1> + + 112-121 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop + 122-127 ~ readAs, readBs, tid + 128-135 ~ tmp_data, tmp_shl, tmp_param0, tmp_param1 + 144-159 ~ k<1-3>, x<1-3> + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +// special to register +// tid = 0 : 255 +// blkA = 0 : M / 128 +// blkB = 0 : N / 128 +// blkZ = 0 +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkA, SR_CTAID.Y; +-:-:-:-:00 S2R blkB, SR_CTAID.Z; +-:-:-:-:00 S2R blkZ, SR_CTAID.X;// blkZ=1 + +-:-:-:-:00 MOV k, param_k; +-:-:-:-:00 MOV ldaz, RZ; +-:-:-:-:00 MOV ldbz, RZ; +-:-:-:-:00 MOV ldcz, RZ; +-:-:-:-:00 MOV lda, param_lda; +-:-:-:-:00 MOV ldb, param_ldb8; +-:-:-:-:00 SHR.U32 ldb, ldb, 5;// ldb is not byte +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15); + + +// tidAY = (tid & 1) << 2 +// tidAY = 0, 4 +-:-:-:-:00 LOP.AND tid1, tid, 1; 
+-:-:-:-:00 SHL tidAY, tid1, 2; + +// tidAX = tid >> 1 +// tidAX = 0 : 1 : 128 +-:-:-:-:00 SHR.U32 tidAX, tid, 1; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +// - +// - +// blkA - +// - +// - +// tidAX ---- trackA +// tidAY +-:-:-:-:00 ISCADD txa, blkA, tidAX, 7; +-:-:-:-:00 IMAD ta, lda, txa, tidAY; +-:-:-:-:00 IMAD ta, ldaz, blkZ, ta; +// TODO(keren): 0x2? +-:-:-:-:00 MOV tmp_param0, param_A[0]; +-:-:-:-:00 MOV tmp_param1, param_A[1]; +-:-:-:-:00 SHL tmp_shl, ta, 0x2; +-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1; + +-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) & 7 +// tidBX = 0 : 4 : 128 +// tidBY = 0 : 1 : 8 +-:-:-:-:00 LOP.AND tid31, tid, 31; +-:-:-:-:00 SHL tidBX, tid31, 2; +-:-:-:-:00 BFE.U32 tidBY, tid, 0x305; // 3 bits at position 5 + +// trackB += (blkB*128 + ldb*tidBY + tidBX) * 4 +// - +// - +// - +// - +// tidBY --------------- trackB +// blkB tidBX +-:-:-:-:00 ISCADD txb, blkB, tidBX, 7; +-:-:-:-:00 IMAD tb, ldb, tidBY, txb; +-:-:-:-:00 IMAD tb, ldbz, blkZ, tb; +// TODO(keren): 0x2? 
+-:-:-:-:00 MOV tmp_param0, param_B[0]; +-:-:-:-:00 MOV tmp_param1, param_B[1]; +-:-:-:-:00 SHL tmp_shl, tb, 0x2; +-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1; + +// TODO(keren): blkB * 128 + tidBX < param_n +-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeAs = 4 * (128 * tidAY + tidAX + 128 * 8 * 2) +// tidAX = 0 : 1 : 128 +// tidAY = 0, 4 +// ---------------- +// ---------------- tidAY 0, 4 +// ---------------- +// ---------------- +// ---- writeAs +// tidAX +-:-:-:-:00 ISCADD writeAs, tidAY, tidAX, 7; +-:-:-:-:00 ISCADD writeAs, writeAs, 4x<128*8*2>, 2; + +// writeBs = (128*tidBY + tidBX + 128 * 8 * 3) * 4 +// tidBX = 0 : 4 : 128 +// tidBY = 0 : 1 : 8 +// ---------------- +// ---------------- +// ---------------- +// ---------------- tidBY +// ---- writeBs +// tidBX +-:-:-:-:00 ISCADD writeBs, tidBY, tidBX, 7; +-:-:-:-:00 ISCADD writeBs, writeBs, 4x<128*8*3>, 2; + +// (keren): A allocate 128 * 8 elements +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +// [6][5][4][0] * 4 +// readAs = 0 : 1 : 64 +-:-:-:-:00 LOP.AND readAs, tid, 0x70; +-:-:-:-:00 SHR.U32 readAs, readAs, 3; +-:-:-:-:00 LOP.OR readAs, readAs, tid1; +-:-:-:-:00 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +// [7][3][2][1] * 4 * 4 + 4 * 128 * 8 +// readBs = 0 : 1 : 64 +-:-:-:-:00 LOP.AND tid128, tid, 128; +-:-:-:-:00 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 SHR.U32 readBs, tid128, 4; +-:-:-:-:00 LOP.OR readBs, readBs, tid7; +-:-:-:-:00 ISCADD readBs, readBs, 4x<128*8>, 4; + +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + + REMAINDER: + + + return q{ + // doLoad0 = tidBY < k + -:-:-:-:00 IADD x1, txb, 1; + -:-:-:-:00 IADD x2, txb, 2; + -:-:-:-:00 IADD x3, txb, 3; + + -:-:-:-:00 ISETP.LT.AND P0, PT, tidBY, k, P6; + -:-:-:-:00 ISETP.LT.AND P1, PT, x1, param_n, P0; + -:-:-:-:00 ISETP.LT.AND P2, PT, x2, param_n, P0; + -:-:-:-:00 ISETP.LT.AND 
P3, PT, x3, param_n, P0; + + -:-:-:-:00 @P0 LD.E.CI loadB0, [trackB + 4x<0>]; + -:-:-:-:00 @P1 LD.E.CI loadB1, [trackB + 4x<1>]; + -:-:-:-:00 @P2 LD.E.CI loadB2, [trackB + 4x<2>]; + -:-:-:-:00 @P3 LD.E.CI loadB3, [trackB + 4x<3>]; + + -:-:-:-:00 @!P0 MOV loadB0, RZ; + -:-:-:-:00 @!P1 MOV loadB1, RZ; + -:-:-:-:00 @!P2 MOV loadB2, RZ; + -:-:-:-:00 @!P3 MOV loadB3, RZ; + + -:-:-:-:00 IADD k1, tidAY, 1; + -:-:-:-:00 IADD k2, tidAY, 2; + -:-:-:-:00 IADD k3, tidAY, 3; + + -:-:-:-:00 ISETP.LT.AND P0, PT, tidAY, k, P5; + -:-:-:-:00 ISETP.LT.AND P1, PT, k1, k, P5; + -:-:-:-:00 ISETP.LT.AND P2, PT, k2, k, P5; + -:-:-:-:00 ISETP.LT.AND P3, PT, k3, k, P5; + + -:-:-:-:00 @P0 LD.E.CI loadA0, [trackA + 4x<0>]; + -:-:-:-:00 @P1 LD.E.CI loadA1, [trackA + 4x<1>]; + -:-:-:-:00 @P2 LD.E.CI loadA2, [trackA + 4x<2>]; + -:-:-:-:00 @P3 LD.E.CI loadA3, [trackA + 4x<3>]; + + -:-:-:-:00 @!P0 MOV loadA0, RZ; + -:-:-:-:00 @!P1 MOV loadA1, RZ; + -:-:-:-:00 @!P2 MOV loadA2, RZ; + -:-:-:-:00 @!P3 MOV loadA3, RZ; + + // bDoRemainder = k > 8 + -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, PT; + + -:G:-:-:15 STS.128 [writeBs], loadB0; + + -:G:-:-:15 STS [writeAs + 4x<0*128>], loadA0; + -:G:-:-:15 STS [writeAs + 4x<1*128>], loadA1; + -:G:-:-:15 STS [writeAs + 4x<2*128>], loadA2; + -:G:-:-:15 STS [writeAs + 4x<3*128>], loadA3; + + -:-:-:-:00 IADD trackB0.CC, trackB0, param_ldb8; + -:-:-:-:00 IADD.X trackB1, trackB1, RZ; + + -:-:-:-:00 IADD trackA0.CC, trackA0, 4x<8>; + -:-:-:-:00 IADD.X trackA1, trackA1, RZ; + }; + + + +// double buffer +// readAs = readAs + 128 * 8 * 2 +// readBs = readBs + 128 * 8 * 2 +-:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>; +-:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>; +-:-:-:-:00 BAR.SYNC 0; +// writeAs = writeAs - 128 * 8 * 2 +// writeBs = writeBs - 128 * 8 * 2 +-:-:-:-:00 LOP.XOR writeAs, writeAs, 4x<128*8*2>; +-:-:-:-:00 LOP.XOR writeBs, writeBs, 4x<128*8*2>; + + + my $k_end = 24; + our %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c53 => 
"-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c63 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + + # cannot use LDG because of the offset + j1c47 => "-:-:-:-:00 \@P3 LD.E.CI loadB0, [trackB + 4x<0>];\n", + j1c53 => "-:-:-:-:00 \@P3 LD.E.CI loadB1, [trackB + 4x<1>];\n", + j1c61 => "-:-:-:-:00 \@P3 LD.E.CI loadB2, [trackB + 4x<2>];\n", + j1c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB3, [trackB + 4x<3>];\n", + + j2c47 => "-:-:-:-:00 \@P2 LD.E.CI loadA0, [trackA + 4x<0>];\n", + j2c53 => "-:-:-:-:00 \@P2 LD.E.CI loadA1, [trackA + 4x<1>];\n", + j2c61 => "-:-:-:-:00 \@P2 LD.E.CI loadA2, [trackA + 4x<2>];\n", + j2c62 => "-:-:-:-:00 \@P2 LD.E.CI loadA3, [trackA + 4x<3>];\n", + + j3c47 => "-:-:D:S:02 \@P0 STS.128 [writeBs], loadB0;\n", + j3c53 => "-:-:-:-:00 IADD32I k, k, -8;\n", + + j4c47 => "-:-:-:-:00 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j4c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j5c47 => "-:-:D:S:02 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n", + j5c53 => "-:-:D:S:02 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n", + j5c61 => "-:-:D:-:07 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n", + j5c62 => "-:-:D:-:07 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n", + + j6c47 => "-:-:-:-:00 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n", + j6c53 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n", + j6c63 => "T:-:D:S:00 BAR.SYNC 0;\n", + + j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n", + j7c53 => "-:-:-:-:00 \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n", + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n" . 
+ "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + + diff --git a/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.cu b/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.cu new file mode 100644 index 0000000..3e262ba --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.cu @@ -0,0 +1,25 @@ +extern "C" +__global__ void __launch_bounds__(256) sgemm_nn_128x128_vec +( + const float* param_A, + const float* param_B, + float* param_C, + float param_alpha, + float param_beta, + int param_lda, + int param_ldb8, + int param_ldc, + int param_m, + int param_n, + int param_k + ) { + __shared__ float share[128 * 8 * 4 + 32]; + + int tid = threadIdx.x; + + share[tid] = 1; + + __syncthreads(); + + param_C[tid] = share[255 - tid]; +} diff --git a/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.sass b/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.sass new file mode 100644 index 0000000..723dadd --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.sass @@ -0,0 +1,260 @@ +# Kernel: sgemm_nn_128x128_vec + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_A[0] : c[0x0][0x140] + param_A[1] : c[0x0][0x144] + param_B[0] : c[0x0][0x148] + param_B[1] : c[0x0][0x14c] + param_C[0] : c[0x0][0x150] + param_C[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_lda : c[0x0][0x160] + param_ldb8 : c[0x0][0x164] + param_ldc : c[0x0][0x168] + param_m : c[0x0][0x16c] + param_n : c[0x0][0x170] + param_k : c[0x0][0x174] + + + + 64-91 ~ blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, txa, txb, tid31, tid128, tidBY, ta, tb, tmp_shl + 92-93 ~ tmp_param0, tmp_param1 + + 0-63 : czero<00-63> + + // avoid ffma single instruction bank conflict + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 
46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 : j0Ay<0-3> + 68-71 : j0Bx<0-3> + 72-75 : j0Ay<4-7> + 76-79 : j0Bx<4-7> + 80-83 : j1Ay<0-3> + 84-87 : j1Bx<0-3> + 88-91 : j1Ay<4-7> + 92-95 : j1Bx<4-7> + + 96-103 : loadA<0-3>, loadB<0-3> + + 104-107 : trackA<0-1>, trackB<0-1> + + 108-112 ~ writeAs, writeBs, k, k_and, tidAY + // to avoid lds bank conflict with ffma + 117 ~ readAs + 116 ~ readBs + 115 ~ tid + + 64-75 ~ ldc, ci, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-101 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +//special to register +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkA, SR_CTAID.Y; +-:-:-:-:00 S2R blkB, SR_CTAID.Z; +-:-:-:-:00 S2R blkZ, SR_CTAID.X;//blkZ=1 + +-:-:-:-:00 MOV k, param_k; +-:-:-:-:00 MOV ldaz, RZ; +-:-:-:-:00 MOV ldbz, RZ; +-:-:-:-:00 MOV ldcz, RZ; +-:-:-:-:00 MOV lda, param_lda; +-:-:-:-:00 MOV ldb, param_ldb8; +-:-:-:-:00 SHR.U32 ldb, ldb, 5;//ldb is not byte +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + join('', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15); + + +// tidAY = (tid & 1) << 2 +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 SHL tidAY, tid1, 2; + +// tidAX = tid >> 1 +-:-:-:-:00 SHR.U32 tidAX, tid, 1; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +-:-:-:-:00 ISCADD txa, blkA, tidAX, 7; +-:-:-:-:00 IMAD ta, lda, txa, tidAY; +-:-:-:-:00 IMAD ta, ldaz, blkZ, ta; +// TODO(keren): 0x2? 
+-:-:-:-:00 MOV tmp_param0, param_A[0]; +-:-:-:-:00 MOV tmp_param1, param_A[1]; +-:-:-:-:00 SHL tmp_shl, ta, 0x2; +-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1; + +-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) & 7 +-:-:-:-:00 LOP.AND tid31, tid, 31; +-:-:-:-:00 SHL tidBX, tid31, 2; +-:-:-:-:00 BFE.U32 tidBY, tid, 0x305; // 3 bits at position 5 + +// trackB += (blkB*128 + ldb*tidBY + tidBX) * 4 +-:-:-:-:00 ISCADD txb, blkB, tidBX, 7; +-:-:-:-:00 IMAD tb, ldb, tidBY, txb; +-:-:-:-:00 IMAD tb, ldbz, blkZ, tb; +// TODO(keren): 0x2? +-:-:-:-:00 MOV tmp_param0, param_B[0]; +-:-:-:-:00 MOV tmp_param1, param_B[1]; +-:-:-:-:00 SHL tmp_shl, tb, 0x2; +-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1; + +// TODO(keren): blkB * 128 + tidBX < param_n +-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeAs = 4 * (128 * tidAY + tidAX + 128 * 8 * 2) +-:-:-:-:00 ISCADD writeAs, tidAY, tidAX, 7; +-:-:-:-:00 ISCADD writeAs, writeAs, 4x<128*8*2>, 2; + +// writeBs = (128*tidBY + tidBX + 128 * 8 * 3) * 4 +-:-:-:-:00 ISCADD writeBs, tidBY, tidBX, 7; +-:-:-:-:00 ISCADD writeBs, writeBs, 4x<128*8*3>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +-:-:-:-:00 LOP.AND readAs, tid, 0x70; +-:-:-:-:00 SHR.U32 readAs, readAs, 3; +-:-:-:-:00 LOP.OR readAs, readAs, tid1; +-:-:-:-:00 SHL readAs, readAs, 4; + +// (keren): A allocate 128 * 8 elements +// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4096; +-:-:-:-:00 LOP.AND tid128, tid, 128; +-:-:-:-:00 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 SHR.U32 readBs, tid128, 4; +-:-:-:-:00 LOP.OR readBs, readBs, tid7; +-:-:-:-:00 ISCADD readBs, readBs, 4x<128*8>, 4; + + REMAINDER: + + + return q{ + // k must be a multiple of 4 + // n must be a multiple of 4 + // - + // - + // - + // - + // tidBY --------------- trackB ---- loadB0 + // blkB tidBX + -:-:-:-:00 @P6 LD.E.CI.128 loadB0, [trackB]; +
// - + // - + // blkA - + // - + // - + // tidAX ---- trackA ---- loadA0 -------- loadA4 + + // load if tidAY < k (tidAY == 0 if mod 4 not mod 8) + -:-:-:-:00 ISETP.LT.AND P5, PT, tidAY, k, P5; + -:-:-:-:00 @P5 LD.E.CI.128 loadA0, [trackA]; + + // bDoRemainder = k & 7 && k > 8 + -:-:-:-:00 LOP.AND k_and, k, 7; + -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT; + -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1; + + -:-:-:-:00 @!P6 LDS.128 loadB0, [RZ + addr_zero]; + -:-:-:-:00 @!P5 LDS.128 loadA0, [RZ + addr_zero]; + + // ---------------------- + // ---------------------- + // ---------------------- tidBY + // ----- writeBS ---- loadB0 + // tidBX + + -:-:-:-:00 STS.128 [writeBs], loadB0; + + // ------------------ + // ------------------ tidAY 0, 4 + // ------------------ + // ------ writeAS - loadA0 + // ---------------- loadA1 + // ---------------- loadA2 + // ---------------- loadA3 + // tidAX + -:-:-:-:00 STS [writeAs + 4x<0*128>], loadA0; + -:-:-:-:00 STS [writeAs + 4x<1*128>], loadA1; + -:-:-:-:00 STS [writeAs + 4x<2*128>], loadA2; + -:-:-:-:00 STS [writeAs + 4x<3*128>], loadA3; + + -:-:-:-:00 IADD trackB0.CC, trackB0, param_ldb8; + -:-:-:-:00 IADD.X trackB1, trackB1, RZ; + + -:-:-:-:00 IADD trackA0.CC, trackA0, 4x<8>; + -:-:-:-:00 IADD.X trackA1, trackA1, RZ; + }; + + +// TODO(keren): double buffer? 
+-:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>; +-:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>; +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 LOP.XOR writeAs, writeAs, 4x<128*8*2>; +-:-:-:-:00 LOP.XOR writeBs, writeBs, 4x<128*8*2>; + +// instruction align + + + my $k_end = 16; + our %insert = + ( + # P0 must be the topest + j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + j0c62 => "-:G:D:-:00 \@P2 LDG.E.CI.128 loadA0, [trackA];\n", + j0c63 => "-:G:D:-:00 \@P3 LDG.E.CI.128 loadB0, [trackB];\n", + + j1c47 => "-:-:-:-:00 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n", + j1c53 => "-:-:-:-:00 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + + j2c47 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n", + j2c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j3c47 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j3c53 => "-:-:-:-:00 IADD32I k, k, -8;\n", + + j5c47 => "T:-:D:S:00 TEXDEPBAR 0x1;\n", + j5c53 => "-:-:D:S:00 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n", + j5c61 => "-:-:D:S:00 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n", + j5c62 => "-:-:D:S:00 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n", + j5c63 => "-:-:D:S:00 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n", + + j6c47 => "T:-:D:S:00 TEXDEPBAR 0x0;\n", + j6c53 => "-:-:D:S:00 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n", + j6c63 => "T:-:D:S:00 BAR.SYNC 0x0;\n", + + j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n", + j7c53 => "-:-:-:-:00 \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n", + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n". 
+ "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + + diff --git a/Kernel/SGEMM/Kepler/sgemm_nt_128x128.cu b/Kernel/SGEMM/Kepler/sgemm_nt_128x128.cu new file mode 100644 index 0000000..663c184 --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_nt_128x128.cu @@ -0,0 +1,25 @@ +extern "C" +__global__ void __launch_bounds__(256) sgemm_nt_128x128 +( + const float* param_A, + const float* param_B, + float* param_C, + float param_alpha, + float param_beta, + int param_lda, + int param_ldb, + int param_ldc, + int param_m, + int param_n, + int param_k + ) { + __shared__ float share[128 * 8 * 4 + 32]; + + int tid = threadIdx.x; + + share[tid] = 1; + + __syncthreads(); + + param_C[tid] = share[255 - tid]; +} diff --git a/Kernel/SGEMM/Kepler/sgemm_nt_128x128.sass b/Kernel/SGEMM/Kepler/sgemm_nt_128x128.sass new file mode 100644 index 0000000..eb48e24 --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_nt_128x128.sass @@ -0,0 +1,247 @@ +# Kernel: sgemm_nt_128x128 + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_A[0] : c[0x0][0x140] + param_A[1] : c[0x0][0x144] + param_B[0] : c[0x0][0x148] + param_B[1] : c[0x0][0x14c] + param_C[0] : c[0x0][0x150] + param_C[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_lda : c[0x0][0x160] + param_ldb : c[0x0][0x164] + param_ldc : c[0x0][0x168] + param_m : c[0x0][0x16c] + param_n : c[0x0][0x170] + param_k : c[0x0][0x174] + + + + + 64-95 ~ blkA, blkB, blkZ, tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, tid127, txa, txb, xmad_ta, xmad_tb, tid128 + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 : j0Ay<0-3> + 68-71 : j0Bx<0-3> + 72-75 
: j0Ay<4-7> + 76-79 : j0Bx<4-7> + 80-83 : j1Ay<0-3> + 84-87 : j1Bx<0-3> + 88-91 : j1Ay<4-7> + 92-95 : j1Bx<4-7> + + 96-103 : loadA<0-3>, loadB<0-3> + 112-115 : trackA<0-1>, trackB<0-1> + + 116-122 ~ writeS, k, tidY, ta, tb, loop + 123-127 ~ readAs, readBs, tid, k_and + 128-135 ~ tmp_data, tmp_shl, tmp_param0, tmp_param1 + 144-150 ~ k1, k2, k3 + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkA, SR_CTAID.Y; +-:-:-:-:00 S2R blkB, SR_CTAID.Z; +-:-:-:-:00 S2R blkZ, SR_CTAID.X; + +-:-:-:-:00 MOV k, param_k; +-:-:-:-:00 MOV ldaz, RZ; +-:-:-:-:00 MOV ldbz, RZ; +-:-:-:-:00 MOV ldcz, RZ; +-:-:-:-:00 LOP.AND tid1, tid, 1; + +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15); + +-:-:-:-:00 MOV lda, param_lda; +-:-:-:-:00 MOV ldb, param_ldb; + +// tidY = tid1 << 2 +-:-:-:-:00 SHL tidY, tid1, 2; + +// tidX = tid >> 1 +-:-:-:-:00 SHR.U32 tidX, tid, 1; + +// trackA += 4 * ((blkA * 128 + tidX) * lda + tidY) +-:-:-:-:00 ISCADD txa, blkA, tidX, 7; +-:-:-:-:00 IMAD ta, lda, txa, tidY; +-:-:-:-:00 IMAD ta, ldaz, blkZ, ta; +-:-:-:-:00 MOV tmp_param0, param_A[0]; +-:-:-:-:00 MOV tmp_param1, param_A[1]; +-:-:-:-:00 SHL tmp_shl, ta, 0x2; +-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1; + +// trackB += 4 * ((blkB * 128 + tidX) * ldb + tidY) +-:-:-:-:00 ISCADD txb, blkB, tidX, 7; +-:-:-:-:00 IMAD tb, ldb, txb, tidY; +-:-:-:-:00 IMAD tb, ldbz, blkZ, tb; +-:-:-:-:00 MOV tmp_param0, param_B[0]; +-:-:-:-:00 MOV tmp_param1, param_B[1]; +-:-:-:-:00 SHL tmp_shl, tb, 0x2; +-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1; + +-:-:-:-:00 ISETP.LT.AND P5, PT, txa, 
param_m, PT; +-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = 4 * (128 * tidY + tidX) +-:-:-:-:00 ISCADD writeS, tidY, tidX, 7; +-:-:-:-:00 SHL writeS, writeS, 2; + +-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +-:-:-:-:00 LOP.AND readAs, tid, 0x70; +-:-:-:-:00 SHR.U32 readAs, readAs, 3; +-:-:-:-:00 LOP.OR readAs, readAs, tid1; +-:-:-:-:00 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +-:-:-:-:00 LOP.AND tid128, tid, 128; +-:-:-:-:00 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 SHR.U32 readBs, tid128, 4; +-:-:-:-:00 LOP.OR readBs, readBs, tid7; +-:-:-:-:00 ISCADD readBs, readBs, 4x<128*8>, 4; + +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + +REMAINDER: + + + return q{ + -:-:-:-:00 IADD k1, tidY, 1; + -:-:-:-:00 IADD k2, tidY, 2; + -:-:-:-:00 IADD k3, tidY, 3; + + -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P5; + -:-:-:-:00 ISETP.LT.AND P1, PT, k1, k, P5; + -:-:-:-:00 ISETP.LT.AND P2, PT, k2, k, P5; + -:-:-:-:00 ISETP.LT.AND P3, PT, k3, k, P5; + + -:-:-:-:00 @P0 LD.E.CI loadA0, [trackA + 4x<0>]; + -:-:-:-:00 @P1 LD.E.CI loadA1, [trackA + 4x<1>]; + -:-:-:-:00 @P2 LD.E.CI loadA2, [trackA + 4x<2>]; + -:-:-:-:00 @P3 LD.E.CI loadA3, [trackA + 4x<3>]; + + -:-:-:-:00 @!P0 MOV loadA0, RZ; + -:-:-:-:00 @!P1 MOV loadA1, RZ; + -:-:-:-:00 @!P2 MOV loadA2, RZ; + -:-:-:-:00 @!P3 MOV loadA3, RZ; + + -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P6; + -:-:-:-:00 ISETP.LT.AND P1, PT, k1, k, P6; + -:-:-:-:00 ISETP.LT.AND P2, PT, k2, k, P6; + -:-:-:-:00 ISETP.LT.AND P3, PT, k3, k, P6; + + -:-:-:-:00 @P0 LD.E.CI loadB0, [trackB + 4x<0>]; + -:-:-:-:00 @P1 LD.E.CI loadB1, [trackB + 4x<1>]; + -:-:-:-:00 @P2 LD.E.CI loadB2, [trackB + 4x<2>]; + -:-:-:-:00 @P3 LD.E.CI loadB3, [trackB + 4x<3>]; + + -:-:-:-:00 @!P0 MOV loadB0, RZ; + -:-:-:-:00 @!P1 MOV loadB1, RZ; + -:-:-:-:00 @!P2 MOV loadB2, RZ; + -:-:-:-:00 @!P3 MOV loadB3, RZ; + + // bDoRemainder = k & 7 && k > 8 + -:-:-:-:00 
LOP.AND k_and, k, 7; + -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT; + + -:G:-:-:15 STS [writeS + 4x<0*128>], loadA0; + -:G:-:-:15 STS [writeS + 4x<1*128>], loadA1; + -:G:-:-:15 STS [writeS + 4x<2*128>], loadA2; + -:G:-:-:15 STS [writeS + 4x<3*128>], loadA3; + + -:G:-:-:15 STS [writeS + 4x< 8*128>], loadB0; + -:G:-:-:15 STS [writeS + 4x< 9*128>], loadB1; + -:G:-:-:15 STS [writeS + 4x<10*128>], loadB2; + -:G:-:-:15 STS [writeS + 4x<11*128>], loadB3; + + -:-:-:-:00 IADD trackA0.CC, trackA0, 4x<8>; + -:-:-:-:00 IADD.X trackA1, trackA1, RZ; + + -:-:-:-:00 IADD trackB0.CC, trackB0, 4x<8>; + -:-:-:-:00 IADD.X trackB1, trackB1, RZ; + + -:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>; + -:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>; + -:-:-:-:00 BAR.SYNC 0; + -:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>; + + -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1; + }; + + + + our %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, 16, P5;\n", + j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, 16, P6;\n", + j0c61 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, 16, PT;\n", + j0c62 => "-:-:-:-:00 \@P2 LD.E.CI loadA0, [trackA + 4x<0>];\n", + + j1c47 => "-:-:-:-:00 \@P2 LD.E.CI loadA1, [trackA + 4x<1>];\n", + j1c53 => "-:-:-:-:00 \@P2 LD.E.CI loadA2, [trackA + 4x<2>];\n", + j1c61 => "-:-:-:-:00 \@P2 LD.E.CI loadA3, [trackA + 4x<3>];\n", + j1c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB0, [trackB + 4x<0>];\n", + + j2c47 => "-:-:-:-:00 \@P3 LD.E.CI loadB1, [trackB + 4x<1>];\n", + j2c53 => "-:-:-:-:00 IADD32I k, k, -8;\n", + j2c61 => "-:-:-:-:00 \@P3 LD.E.CI loadB2, [trackB + 4x<2>];\n", + j2c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB3, [trackB + 4x<3>];\n", + + j3c47 => "-:-:-:-:00 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n", + j3c53 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j4c47 => "-:-:D:S:02 \@P0 STS [writeS + 4x<0*128>], loadA0;\n", + j4c53 => "-:-:D:S:02 \@P0 STS [writeS + 4x<1*128>], loadA1;\n", + j4c61 => "-:-:D:-:07 \@P0 STS [writeS + 4x<2*128>], loadA2;\n", + j4c62 => "-:-:D:-:07 \@P0 
STS [writeS + 4x<3*128>], loadA3;\n", + + j5c47 => "-:-:D:S:02 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n", + j5c53 => "-:-:D:S:02 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n", + j5c61 => "-:-:D:-:07 \@P0 STS [writeS + 4x<10*128>], loadB2;\n", + j5c62 => "-:-:D:-:07 \@P0 STS [writeS + 4x<11*128>], loadB3;\n", + + j6c47 => "-:-:-:-:00 \@P3 IADD trackB0.CC, trackB0, 4x<8>;\n", + j6c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n", + j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n", + + j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n", + + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n". + "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + + diff --git a/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.cu b/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.cu new file mode 100644 index 0000000..7cf98a6 --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.cu @@ -0,0 +1,25 @@ +extern "C" +__global__ void __launch_bounds__(256) sgemm_nt_128x128_vec +( + const float* param_A, + const float* param_B, + float* param_C, + float param_alpha, + float param_beta, + int param_lda, + int param_ldb, + int param_ldc, + int param_m, + int param_n, + int param_k + ) { + __shared__ float share[128 * 8 * 4 + 32]; + + int tid = threadIdx.x; + + share[tid] = 1; + + __syncthreads(); + + param_C[tid] = share[255 - tid]; +} diff --git a/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.sass b/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.sass new file mode 100644 index 0000000..4084d3d --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.sass @@ -0,0 +1,222 @@ +# Kernel: sgemm_nt_128x128_vec + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_A[0] : c[0x0][0x140] + param_A[1] : c[0x0][0x144] + param_B[0] : c[0x0][0x148] + param_B[1] : c[0x0][0x14c] + param_C[0] : c[0x0][0x150] + param_C[1] : c[0x0][0x154] + 
param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_lda : c[0x0][0x160] + param_ldb : c[0x0][0x164] + param_ldc : c[0x0][0x168] + param_m : c[0x0][0x16c] + param_n : c[0x0][0x170] + param_k : c[0x0][0x174] + + + + + 64-91 ~ blkA, blkB, blkZ, tidX, lda, ldb, ldaz, ldbz, tid1, tid7, txa, txb, tid128, ta, tb, tmp_shl + 92-93 : tmp_param0, tmp_param1 + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 : j0Ay<0-3> + 68-71 : j0Bx<0-3> + 72-75 : j0Ay<4-7> + 76-79 : j0Bx<4-7> + 80-83 : j1Ay<0-3> + 84-87 : j1Bx<0-3> + 88-91 : j1Ay<4-7> + 92-95 : j1Bx<4-7> + + 96-103 : loadA<0-3>, loadB<0-3> + 104-107 : trackA<0-1>, trackB<0-1> + + 108-111 ~ writeS, k, k_and, tidY + 117 ~ readAs + 116 ~ readBs + 115 ~ tid + + 64-75 ~ ldc, ci, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-101 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkA, SR_CTAID.Y; +-:-:-:-:00 S2R blkB, SR_CTAID.Z; +-:-:-:-:00 S2R blkZ, SR_CTAID.X; + +-:-:-:-:00 MOV k, param_k; +-:-:-:-:00 MOV ldaz, RZ; +-:-:-:-:00 MOV ldbz, RZ; +-:-:-:-:00 MOV ldcz, RZ; +-:-:-:-:00 LOP.AND tid1, tid, 1; + +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15); + +-:-:-:-:00 MOV lda, param_lda; +-:-:-:-:00 MOV ldb, param_ldb; + +// tidY = tid1 << 2 +-:-:-:-:00 SHL tidY, tid1, 2; + +// tidX = tid >> 1 +-:-:-:-:00 SHR.U32 tidX, tid, 1; + +// trackA += 4 * ((blkA * 128 + tidX) * lda + tidY) +-:-:-:-:00 ISCADD txa, blkA, tidX, 7; 
+-:-:-:-:00 IMAD ta, lda, txa, tidY; +-:-:-:-:00 IMAD ta, ldaz, blkZ, ta; +-:-:-:-:00 MOV tmp_param0, param_A[0]; +-:-:-:-:00 MOV tmp_param1, param_A[1]; +-:-:-:-:00 SHL tmp_shl, ta, 0x2; +-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1; + +// trackB += 4 * ((blkB * 128 + tidX) * ldb + tidY) +-:-:-:-:00 ISCADD txb, blkB, tidX, 7; +-:-:-:-:00 IMAD tb, ldb, txb, tidY; +-:-:-:-:00 IMAD tb, ldbz, blkZ, tb; +-:-:-:-:00 MOV tmp_param0, param_B[0]; +-:-:-:-:00 MOV tmp_param1, param_B[1]; +-:-:-:-:00 SHL tmp_shl, tb, 0x2; +-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1; + +-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT; +-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = 4 * (128 * tidY + tidX) +-:-:-:-:00 ISCADD writeS, tidY, tidX, 7; +-:-:-:-:00 SHL writeS, writeS, 2; + +-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +-:-:-:-:00 LOP.AND readAs, tid, 0x70; +-:-:-:-:00 SHR.U32 readAs, readAs, 3; +-:-:-:-:00 LOP.OR readAs, readAs, tid1; +-:-:-:-:00 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +-:-:-:-:00 LOP.AND tid128, tid, 128; +-:-:-:-:00 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 SHR.U32 readBs, tid128, 4; +-:-:-:-:00 LOP.OR readBs, readBs, tid7; +-:-:-:-:00 ISCADD readBs, readBs, 4x<128*8>, 4; + +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + +REMAINDER: + + + return q{ + // k must be multiple of 4 + // load if tidY < k (tidY == 0 if mod 4 not mod 8) + -:-:-:-:00 ISETP.LT.AND P5, PT, tidY, k, P5; + -:-:-:-:00 @P5 LD.E.CI.128 loadA0, [trackA + 4x<0>]; + + // load if tidY < k (tidY == 0 if mod 4 not mod 8) + -:-:-:-:00 ISETP.LT.AND P6, PT, tidY, k, P6; + -:-:-:-:00 @P6 LD.E.CI.128 loadB0, [trackB + 4x<0>]; + + -:-:-:-:00 @!P5 LDS.128 loadA0, [RZ + addr_zero]; + -:-:-:-:00 @!P6 LDS.128 loadB0, [RZ + addr_zero]; + + // bDoRemainder = k & 7 && k > 8 + -:-:-:-:00 LOP.AND 
k_and, k, 7; + -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT; + -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1; + + -:-:-:-:00 STS [writeS + 4x<0*128>], loadA0; + -:-:-:-:00 STS [writeS + 4x<1*128>], loadA1; + -:-:-:-:00 STS [writeS + 4x<2*128>], loadA2; + -:-:-:-:00 STS [writeS + 4x<3*128>], loadA3; + + -:-:-:-:00 STS [writeS + 4x< 8*128>], loadB0; + -:-:-:-:00 STS [writeS + 4x< 9*128>], loadB1; + -:-:-:-:00 STS [writeS + 4x<10*128>], loadB2; + -:-:-:-:00 STS [writeS + 4x<11*128>], loadB3; + + -:-:-:-:00 IADD trackA0.CC, trackA0, 4x<8>; + -:-:-:-:00 IADD.X trackA1, trackA1, RZ; + + -:-:-:-:00 IADD trackB0.CC, trackB0, 4x<8>; + -:-:-:-:00 IADD.X trackB1, trackB1, RZ; + + -:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>; + -:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>; + -:-:-:-:00 BAR.SYNC 0; + -:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>; + }; + + + + my $k_end = 16; + our %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + j0c61 => "-:G:D:-:07 \@P2 LDG.E.CI.128 loadA0, [trackA];\n", + j0c62 => "-:G:D:-:07 \@P3 LDG.E.CI.128 loadB0, [trackB];\n", + + j1c47 => "-:-:-:-:00 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n", + j1c53 => "-:-:-:-:00 \@P3 IADD trackB0.CC, trackB0, 4x<8>;\n", + + j2c47 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n", + j2c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j3c47 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j3c53 => "-:-:-:-:00 IADD32I k, k, -8;\n", + + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n", + j6c63 => "T:-:D:S:00 TEXDEPBAR 0x0;\n". + "-:G:-:-:15 \@P0 STS [writeS + 4x<8*128>], loadB0;\n". + "-:G:-:-:15 \@P0 STS [writeS + 4x<9*128>], loadB1;\n". + "-:G:-:-:15 \@P0 STS [writeS + 4x<10*128>], loadB2;\n". + "-:G:-:-:15 \@P0 STS [writeS + 4x<11*128>], loadB3;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 NOP;\n". + "-:-:-:-:00 NOP;\n". 
+ "-:G:-:-:15 \@P0 STS [writeS + 4x<0*128>], loadA0;\n". + "-:G:-:-:15 \@P0 STS [writeS + 4x<1*128>], loadA1;\n". + "-:G:-:-:15 \@P0 STS [writeS + 4x<2*128>], loadA2;\n". + "-:G:-:-:15 \@P0 STS [writeS + 4x<3*128>], loadA3;\n". + "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n". + "-:-:-:-:00 NOP;\n". + "T:-:D:S:00 \@P0 BAR.SYNC 0;\n", + + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n". + "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + + + diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x128.cu b/Kernel/SGEMM/Kepler/sgemm_tn_128x128.cu new file mode 100644 index 0000000..c17da1a --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x128.cu @@ -0,0 +1,25 @@ +extern "C" +__global__ void __launch_bounds__(256) sgemm_tn_128x128 +( + const float* param_A, + const float* param_B, + float* param_C, + float param_alpha, + float param_beta, + int param_lda8, + int param_ldb8, + int param_ldc, + int param_m, + int param_n, + int param_k + ) { + __shared__ float share[128 * 8 * 4 + 32]; + + int tid = threadIdx.x; + + share[tid] = 1; + + __syncthreads(); + + param_C[tid] = share[255 - tid]; +} diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x128.sass b/Kernel/SGEMM/Kepler/sgemm_tn_128x128.sass new file mode 100644 index 0000000..0c03a6e --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x128.sass @@ -0,0 +1,241 @@ +# Kernel: sgemm_tn_128x128 + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_A[0] : c[0x0][0x140] + param_A[1] : c[0x0][0x144] + param_B[0] : c[0x0][0x148] + param_B[1] : c[0x0][0x14c] + param_C[0] : c[0x0][0x150] + param_C[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_lda8 : c[0x0][0x160] + param_ldb8 : c[0x0][0x164] + param_ldc : c[0x0][0x168] + param_m : c[0x0][0x16c] + param_n : c[0x0][0x170] + param_k : c[0x0][0x174] + + + + + 64-95 ~ blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, tid1, tid7, tidX, blk, tid31, tid128 + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : 
cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + + 64-67 : j0Ay<0-3> + 68-71 : j0Bx<0-3> + 72-75 : j0Ay<4-7> + 76-79 : j0Bx<4-7> + 80-83 : j1Ay<0-3> + 84-87 : j1Bx<0-3> + 88-91 : j1Ay<4-7> + 92-95 : j1Bx<4-7> + + 96-103 : loadA<0-3>, loadB<0-3> + + 104-107 : trackA<0-1>, trackB<0-1> + + 108-121 ~ writeS, lda8, k, tidY, txa, txb, ta, tb, loop + 122-127 ~ readAs, readBs, tid + 128-135 ~ tmp_data, tmp_shl, tmp_param0, tmp_param1 + 144-155 ~ x<1-3>, y<1-3> + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkA, SR_CTAID.Y; +-:-:-:-:00 S2R blkB, SR_CTAID.Z; +-:-:-:-:00 S2R blkZ, SR_CTAID.X; + +-:-:-:-:00 MOV k, param_k; +-:-:-:-:00 MOV ldaz, RZ; +-:-:-:-:00 MOV ldbz, RZ; +-:-:-:-:00 MOV ldcz, RZ; +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15); + + +// tidX = (tid & 31) << 2 +// tidY = (tid >> 5) & 7 +-:-:-:-:00 LOP.AND tid31, tid, 31; +-:-:-:-:00 SHL tidX, tid31, 2; +-:-:-:-:00 BFE.U32 tidY, tid, 0x305; // 3 bits at position 5 + +-:-:-:-:00 MOV lda, param_lda8; +-:-:-:-:00 MOV ldb, param_ldb8; +-:-:-:-:00 SHR.U32 lda, lda, 5; +-:-:-:-:00 SHR.U32 ldb, ldb, 5; + +// trackA += (blkA*128 + lda*tidY + tidX) * 4 +-:-:-:-:00 ISCADD txa, blkA, tidX, 7; +-:-:-:-:00 IMAD ta, lda, tidY, txa; +-:-:-:-:00 IMAD ta, ldaz, blkZ, ta; +-:-:-:-:00 MOV tmp_param0, param_A[0]; +-:-:-:-:00 MOV tmp_param1, param_A[1]; +-:-:-:-:00 SHL tmp_shl, ta, 0x2; +-:-:-:-:00 IADD
trackA0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1; + +-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// trackB += (blkB*128 + ldb*tidY + tidX) * 4 +-:-:-:-:00 ISCADD txb, blkB, tidX, 7; +-:-:-:-:00 IMAD tb, ldb, tidY, txb; +-:-:-:-:00 IMAD tb, ldbz, blkZ, tb; +-:-:-:-:00 MOV tmp_param0, param_B[0]; +-:-:-:-:00 MOV tmp_param1, param_B[1]; +-:-:-:-:00 SHL tmp_shl, tb, 0x2; +-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1; + +-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = ((128*tidY + tidX) * 4) ^ (4 * 128*8*2) +-:-:-:-:00 ISCADD writeS, tidY, tidX, 7; +-:-:-:-:00 SHL writeS, writeS, 2; +-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 LOP.AND readAs, tid, 0x70; +-:-:-:-:00 SHR.U32 readAs, readAs, 3; +-:-:-:-:00 LOP.OR readAs, readAs, tid1; +-:-:-:-:00 SHL readAs, readAs, 4; + +// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4096; +-:-:-:-:00 LOP.AND tid128, tid, 128; +-:-:-:-:00 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 SHR.U32 readBs, tid128, 4; +-:-:-:-:00 LOP.OR readBs, readBs, tid7; +-:-:-:-:00 ISCADD readBs, readBs, 4x<128*8>, 4; + +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + +REMAINDER: + + + return q{ + // doLoadA = tidY < k && txa < m + // doLoadB = tidY < k && txb < n + -:-:-:-:00 IADD x1, txa, 1; + -:-:-:-:00 IADD x2, txa, 2; + -:-:-:-:00 IADD x3, txa, 3; + -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P5; + -:-:-:-:00 ISETP.LT.AND P1, PT, x1, param_m, P0; + -:-:-:-:00 ISETP.LT.AND P2, PT, x2, param_m, P0; + -:-:-:-:00 ISETP.LT.AND P3, PT, x3, param_m, P0; + + -:-:-:-:00 @P0 LD.E.CI loadA0, [trackA + 4x<0>]; + -:-:-:-:00 @P1 LD.E.CI loadA1, [trackA + 4x<1>]; + -:-:-:-:00 @P2 LD.E.CI loadA2, [trackA + 4x<2>]; + -:-:-:-:00 @P3 LD.E.CI loadA3, [trackA + 4x<3>]; + + -:-:-:-:00 @!P0 MOV loadA0, RZ; + -:-:-:-:00 @!P1 MOV loadA1, RZ; + -:-:-:-:00 @!P2 MOV loadA2, RZ; + -:-:-:-:00 @!P3 MOV loadA3, RZ; + + -:-:-:-:00
@!P2 MOV loadA2, RZ; + -:-:-:-:00 @!P3 MOV loadA3, RZ; + + -:-:-:-:00 IADD y1, txb, 1; + -:-:-:-:00 IADD y2, txb, 2; + -:-:-:-:00 IADD y3, txb, 3; + -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P6; + -:-:-:-:00 ISETP.LT.AND P1, PT, y1, param_n, P0; + -:-:-:-:00 ISETP.LT.AND P2, PT, y2, param_n, P0; + -:-:-:-:00 ISETP.LT.AND P3, PT, y3, param_n, P0; + + -:-:-:-:00 @P0 LD.E.CI loadB0, [trackB + 4x<0>]; + -:-:-:-:00 @P1 LD.E.CI loadB1, [trackB + 4x<1>]; + -:-:-:-:00 @P2 LD.E.CI loadB2, [trackB + 4x<2>]; + -:-:-:-:00 @P3 LD.E.CI loadB3, [trackB + 4x<3>]; + + -:-:-:-:00 @!P0 MOV loadB0, RZ; + -:-:-:-:00 @!P1 MOV loadB1, RZ; + -:-:-:-:00 @!P2 MOV loadB2, RZ; + -:-:-:-:00 @!P3 MOV loadB3, RZ; + + -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, PT; + }; + + +-:-:-:-:00 STS.128 [writeS + 4x<0*128>], loadA0; + +-:-:-:-:00 IADD trackA0.CC, trackA0, param_lda8; +-:-:-:-:00 IADD.X trackA1, trackA1, RZ; + +-:-:-:-:00 STS.128 [writeS + 4x<8*128>], loadB0; + +-:-:-:-:00 IADD trackB0.CC, trackB0, param_ldb8; + +-:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>; +-:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>; +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>; + +-:-:-:-:00 IADD.X trackB1, trackB1, RZ; + + + my $k_end = 24; + our %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + j0c61 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c62 => "-:-:-:-:00 \@P2 LD.E.CI loadA0, [trackA + 4x<0>];\n", + j0c63 => "-:-:-:-:00 \@P2 LD.E.CI loadA1, [trackA + 4x<1>];\n", + + j1c47 => "-:-:-:-:00 \@P2 LD.E.CI loadA2, [trackA + 4x<2>];\n", + j1c53 => "-:-:-:-:00 \@P2 LD.E.CI loadA3, [trackA + 4x<3>];\n", + + j2c47 => "-:-:-:-:00 \@P3 LD.E.CI loadB0, [trackB + 4x<0>];\n", + j2c53 => "-:-:-:-:00 \@P3 LD.E.CI loadB1, [trackB + 4x<1>];\n", + j2c61 => "-:-:-:-:00 \@P3 LD.E.CI loadB2, [trackB + 4x<2>];\n", + j2c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB3, [trackB + 4x<3>];\n", + + j3c47 => "-:-:-:-:00 
\@P2 IADD trackA0.CC, trackA0, param_lda8;\n", + j3c53 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j4c53 => "-:-:D:S:02 \@P0 STS.128 [writeS + 4x<0*128>], loadA0;\n", + + j5c53 => "-:-:D:S:02 \@P0 STS.128 [writeS + 4x<8*128>], loadB0;\n", + + j6c47 => "-:-:-:-:00 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n", + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n", + j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n", + + j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n", + j7c53 => "-:-:-:-:00 IADD32I k, k, -8;\n", + + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n". + "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.cu b/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.cu new file mode 100644 index 0000000..28aa136 --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.cu @@ -0,0 +1,26 @@ +extern "C" +__global__ void __launch_bounds__(256) sgemm_tn_128x128_vec +( + const float* param_A, + const float* param_B, + float* param_C, + float param_alpha, + float param_beta, + int param_lda8, + int param_ldb8, + int param_ldc, + int param_m, + int param_n, + int param_k + ) { + __shared__ float share[128 * 8 * 4 + 32]; + + int tid = threadIdx.x; + + share[tid] = 1; + + __syncthreads(); + + param_C[tid] = share[255 - tid]; +} + diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.sass b/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.sass new file mode 100644 index 0000000..bc896ba --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.sass @@ -0,0 +1,212 @@ +# Kernel: sgemm_tn_128x128_vec + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_A[0] : c[0x0][0x140] + param_A[1] : c[0x0][0x144] + param_B[0] : c[0x0][0x148] + param_B[1] : c[0x0][0x14c] + param_C[0] : c[0x0][0x150] + param_C[1] : c[0x0][0x154] + param_alpha 
: c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_lda8 : c[0x0][0x160] + param_ldb8 : c[0x0][0x164] + param_ldc : c[0x0][0x168] + param_m : c[0x0][0x16c] + param_n : c[0x0][0x170] + param_k : c[0x0][0x174] + + + + + 64-91 ~ blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, tid1, tid7, tidX, tid31, tid128, txa, txb, ta, tb, tmp_shl + 92-93 : tmp_param<0-1> + + 0-63 : czero<00-63> + + 1, 4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0 + 5, 0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1 + 3, 6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2 + 7, 2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3 + 9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4 + 13, 8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5 + 11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6 + 15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7 + + 64-67 : j0Ay<0-3> + 68-71 : j0Bx<0-3> + 72-75 : j0Ay<4-7> + 76-79 : j0Bx<4-7> + 80-83 : j1Ay<0-3> + 84-87 : j1Bx<0-3> + 88-91 : j1Ay<4-7> + 92-95 : j1Bx<4-7> + + 96-103 : loadA<0-3>, loadB<0-3> + + 104-107 : trackA<0-1>, trackB<0-1> + + 108-111 ~ writeS, k, k_and, tidY + 117 ~ readAs + 116 ~ readBs + 115 ~ tid + + 64-75 ~ ldc, ci, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-3>, c<4-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-101 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +-:-:-:-:00 S2R tid, SR_TID.X; +-:-:-:-:00 S2R blkA, SR_CTAID.Y; +-:-:-:-:00 S2R blkB, SR_CTAID.Z; +-:-:-:-:00 S2R blkZ, SR_CTAID.X; + +-:-:-:-:00 MOV k, param_k; +-:-:-:-:00 MOV ldaz, RZ; +-:-:-:-:00 MOV ldbz, RZ; +-:-:-:-:00 MOV ldcz, RZ; +-:-:-:-:00 STS.128 [RZ + addr_zero], RZ; + + join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15); + + +// tidX = (tid & 31) << 2 +// tidY = (tid >> 5) & 7 +-:-:-:-:00 LOP.AND tid31, tid, 31; +-:-:-:-:00 SHL tidX, tid31, 2; +-:-:-:-:00 BFE.U32 tidY, tid, 0x305; // 3 bits at position 5 + +-:-:-:-:00 MOV lda, param_lda8; +-:-:-:-:00 MOV ldb, param_ldb8; +-:-:-:-:00 SHR.U32 lda, lda, 5; +-:-:-:-:00 SHR.U32 
ldb, ldb, 5; + +// trackA += (blkA*128 + lda*tidY + tidX) * 4 +-:-:-:-:00 ISCADD txa, blkA, tidX, 7; +-:-:-:-:00 IMAD ta, lda, tidY, txa; +-:-:-:-:00 IMAD ta, ldaz, blkZ, ta; +-:-:-:-:00 MOV tmp_param0, param_A[0]; +-:-:-:-:00 MOV tmp_param1, param_A[1]; +-:-:-:-:00 SHL tmp_shl, ta, 0x2; +-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1; + +-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// trackB += (blkB*128 + ldb*tidY + tidX) * 4 +-:-:-:-:00 ISCADD txb, blkB, tidX, 7; +-:-:-:-:00 IMAD tb, ldb, tidY, txb; +-:-:-:-:00 IMAD tb, ldbz, blkZ, tb; +-:-:-:-:00 MOV tmp_param0, param_B[0]; +-:-:-:-:00 MOV tmp_param1, param_B[1]; +-:-:-:-:00 SHL tmp_shl, tb, 0x2; +-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0; +-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1; + +-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = (128*tidY + tidX) * 4 +-:-:-:-:00 ISCADD writeS, tidY, tidX, 7; +-:-:-:-:00 SHL writeS, writeS, 2; +-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 LOP.AND readAs, tid, 0x70; +-:-:-:-:00 SHR.U32 readAs, readAs, 3; +-:-:-:-:00 LOP.OR readAs, readAs, tid1; +-:-:-:-:00 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +-:-:-:-:00 LOP.AND tid128, tid, 128; +-:-:-:-:00 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 SHR.U32 readBs, tid128, 4; +-:-:-:-:00 LOP.OR readBs, readBs, tid7; +-:-:-:-:00 ISCADD readBs, readBs, 4x<128*8>, 4; + +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; +-:-:-:-:00 NOP; + +REMAINDER: + + + return q{ + // bDoRemainder = k & 7 && k > 8 + -:-:-:-:00 LOP.AND k_and, k, 7; + -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT; + -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1; + + // doLoad = tidY < k && txa|txb < m|n + -:-:-:-:00 ISETP.LT.AND P2, PT, tidY, k, P5; + -:-:-:-:00 ISETP.LT.AND P3, PT, tidY, k, P6; + + -:-:-:-:00 @P2
LD.E.CI.128 loadA0, [trackA]; + -:-:-:-:00 @P3 LD.E.CI.128 loadB0, [trackB]; + + -:-:-:-:00 @!P2 LDS.128 loadA0, [RZ + addr_zero]; + -:-:-:-:00 @!P3 LDS.128 loadB0, [RZ + addr_zero]; + // Vec 4 and scalar loads + }; + + + +-:-:-:-:00 STS.128 [writeS + 4x<0*128>], loadA0; + +-:-:-:-:00 IADD trackA0.CC, trackA0, param_lda8; +-:-:-:-:00 IADD.X trackA1, trackA1, RZ; + +-:-:-:-:00 STS.128 [writeS + 4x<8*128>], loadB0; + +-:-:-:-:00 IADD trackB0.CC, trackB0, param_ldb8; + +-:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>; +-:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>; +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>; + +-:-:-:-:00 IADD.X trackB1, trackB1, RZ; + + + my $k_end = 16; + our %insert = + ( + j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + j0c61 => "-:G:D:-:07 \@P2 LDG.E.128 loadA, [trackA];\n", + j0c62 => "-:G:D:-:07 \@P3 LDG.E.128 loadB, [trackB];\n", + # Keep each .CC add directly ahead of its own .X add: IADD.X consumes the carry left by the most recent .CC add, so interleaving the A and B pointer bumps would feed trackB0's carry into trackA1. + j1c47 => "-:-:-:-:00 \@P2 IADD trackA0.CC, trackA0, param_lda8;\n", + j1c53 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j2c47 => "-:-:-:-:00 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j2c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j3c47 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j3c53 => "-:-:-:-:00 IADD32I k, k, -8;\n", + + j5c63 => "T:-:D:S:00 TEXDEPBAR 0x0;\n", + j6c47 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*128>], loadA0;\n", + j6c53 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*128>], loadB0;\n", + + j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n", + j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n", + j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n", + + j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n", + j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n" .
+ "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x32.sass b/Kernel/SGEMM/Kepler/sgemm_tn_128x32.sass new file mode 100644 index 0000000..a5324ad --- /dev/null +++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x32.sass @@ -0,0 +1,422 @@ +# Kernel: sgemm_tn_128x32 + + + addr_zero : 4x<128*16*2 + 32*16*2> + szShareA : 128*16 + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +-:-:-:-:00 S2R tid, SR_TID.X; 
+-:-:-:-:00 S2R blkA, SR_CTAID.Y; +-:-:-:-:00 S2R blkB, SR_CTAID.Z; +-:-:-:-:00 S2R blkZ, SR_CTAID.X; + +-:-:-:-:00 MOV k, param_k; +-:-:-:-:00 MOV lda, param_lda8; +-:-:-:-:00 MOV ldb, param_ldb8; +-:-:-:-:00 SHR.U32 lda, lda, 5; +-:-:-:-:00 SHR.U32 ldb, ldb, 5; +-:-:-:-:00 MOV ldaz, param_ldaz; +-:-:-:-:00 MOV ldbz, param_ldbz; +-:-:-:-:00 SHL lda16, lda, 6; +-:-:-:-:00 SHL ldb16, ldb, 6; +-:-:-:-:00 SHL lda4, lda, 2; + +-:-:-:-:00 STS.128 [addr_zero], RZ; + + return join '', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = (tid & 31) << 2 +// tidAY = (tid >> 5) +-:-:-:-:00 LOP.AND tidAX, tid, 31; +-:-:-:-:00 SHL tidAX, tidAX, 2; +-:-:-:-:00 SHR.U32 tidAY, tid, 5; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +-:-:-:-:00 LOP.AND tidBX, tid, 7; +-:-:-:-:00 SHL tidBX, tidBX, 2; +-:-:-:-:00 SHR.U32 tidBY, tid, 3; + +// trackA += (blkA*128 + tidAX + lda*tidAY) * 4 +-:-:-:-:00 ISCADD txa, blkA, tidAX, 7; +-:-:-:-:00 XMAD.LO2 ta0, lda, tidAY, txa; +-:-:-:-:00 XMAD.LO2 ta0, ldaz, blkZ, ta0; +-:-:-:-:00 IADD ta1, ta0, lda4; +-:-:-:-:00 IADD ta2, ta1, lda4; +-:-:-:-:00 IADD ta3, ta2, lda4; + +-:-:-:-:00 LEA track0A0.CC, ta0, param_A[0], 2; +-:-:-:-:00 LEA.HI.X track0A1, ta0, param_A[1], RZ, 2; +-:-:-:-:00 LEA track1A0.CC, ta1, param_A[0], 2; +-:-:-:-:00 LEA.HI.X track1A1, ta1, param_A[1], RZ, 2; +-:-:-:-:00 LEA track2A0.CC, ta2, param_A[0], 2; +-:-:-:-:00 LEA.HI.X track2A1, ta2, param_A[1], RZ, 2; +-:-:-:-:00 LEA track3A0.CC, ta3, param_A[0], 2; +-:-:-:-:00 LEA.HI.X track3A1, ta3, param_A[1], RZ, 2; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4 +-:-:-:-:00 ISCADD txb, blkB, tidBX, 5; +-:-:-:-:00 XMAD.LO2 tb, ldb, tidBY, txb; +-:-:-:-:00 XMAD.LO2 tb, ldbz, blkZ, tb; +-:-:-:-:00 LEA trackB0.CC, tb, param_B[0], 2; +-:-:-:-:00 LEA.HI.X trackB1, tb, param_B[1], RZ, 2; + +// writeAs = (tidAY*128 + tidAX) * 4 +-:-:-:-:00 ISCADD writeAs, tidAY, tidAX, 7; +-:-:-:-:00 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + 
tidBX) * 4 +-:-:-:-:00 ISCADD writeBs, tidBY, tidBX, 5; +-:-:-:-:00 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +-:-:-:-:00 LOP.AND tid1, tid, 1; +-:-:-:-:00 LOP.AND readAs, tid, 0x70; +-:-:-:-:00 SHR.U32 readAs, readAs, 3; +-:-:-:-:00 LOP.OR readAs, readAs, tid1; +-:-:-:-:00 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +-:-:-:-:00 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +-:-:-:-:00 ISCADD readBs, readBs, 4x, 4; + +-:-:-:-:00 MOV32I swapBuf, -4x; + +REMAINDER: + +-:-:-:-:00 IADD tidAY1, tidAY, 4; +-:-:-:-:00 IADD tidAY2, tidAY, 8; +-:-:-:-:00 IADD tidAY3, tidAY, 12; + + + our $vec; + return $vec ? q{ +-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT; +-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT; + +-:-:-:-:00 ISETP.LT.AND P0, PT, tidAY, k, P5; +-:-:-:-:00 ISETP.LT.AND P1, PT, tidAY1, k, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, tidAY2, k, P5; +-:-:-:-:00 ISETP.LT.AND P3, PT, tidAY3, k, P5; +-:-:-:-:00 ISETP.LT.AND P4, PT, tidBY, k, P6; + +-:-:-:-:00 @P0 LDG.E.CI.128 load0A, [track0A]; +-:-:-:-:00 @P1 LDG.E.CI.128 load1A, [track1A]; +-:-:-:-:00 @P2 LDG.E.CI.128 load2A, [track2A]; +-:-:-:-:00 @P3 LDG.E.CI.128 load3A, [track3A]; +-:-:-:-:00 @P4 LDG.E.CI.128 loadB, [trackB]; + +-:-:-:-:00 @!P0 LDS.U.128 load0A, [addr_zero]; +-:-:-:-:00 @!P1 LDS.U.128 load1A, [addr_zero]; +-:-:-:-:00 @!P2 LDS.U.128 load2A, [addr_zero]; +-:-:-:-:00 @!P3 LDS.U.128 load3A, [addr_zero]; +-:-:-:-:00 @!P4 LDS.U.128 loadB, [addr_zero]; + + } : q{ + +-:-:-:-:00 IADD txa1, txa, 1; +-:-:-:-:00 IADD txa2, txa, 2; +-:-:-:-:00 IADD txa3, txa, 3; + +-:-:-:-:00 ISETP.LT.AND P4, PT, tidAY, k, PT; +-:-:-:-:00 ISETP.LT.AND P0, PT, txa, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P4; +-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P4; + +-:-:-:-:00 @P0 LDG.E.CI load0A0, [track0A + 4x<0>]; +-:-:-:-:00 @P1 LDG.E.CI load0A1, [track0A + 4x<1>]; +-:-:-:-:00 @P2 
LDG.E.CI load0A2, [track0A + 4x<2>]; +-:-:-:-:00 @P3 LDG.E.CI load0A3, [track0A + 4x<3>]; + +-:-:-:-:00 @!P0 MOV load0A0, RZ; +-:-:-:-:00 @!P1 MOV load0A1, RZ; +-:-:-:-:00 @!P2 MOV load0A2, RZ; +-:-:-:-:00 @!P3 MOV load0A3, RZ; + +-:-:-:-:00 ISETP.LT.AND P5, PT, tidAY1, k, PT; +-:-:-:-:00 ISETP.LT.AND P0, PT, txa, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +-:-:-:-:00 @P0 LDG.E.CI load1A0, [track1A + 4x<0>]; +-:-:-:-:00 @P1 LDG.E.CI load1A1, [track1A + 4x<1>]; +-:-:-:-:00 @P2 LDG.E.CI load1A2, [track1A + 4x<2>]; +-:-:-:-:00 @P3 LDG.E.CI load1A3, [track1A + 4x<3>]; + +-:-:-:-:00 @!P0 MOV load1A0, RZ; +-:-:-:-:00 @!P1 MOV load1A1, RZ; +-:-:-:-:00 @!P2 MOV load1A2, RZ; +-:-:-:-:00 @!P3 MOV load1A3, RZ; + +-:-:-:-:00 ISETP.LT.AND P6, PT, tidAY2, k, PT; +-:-:-:-:00 ISETP.LT.AND P0, PT, txa, param_m, P6; +-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P6; +-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P6; +-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P6; + +-:-:-:-:00 @P0 LDG.E.CI load2A0, [track2A + 4x<0>]; +-:-:-:-:00 @P1 LDG.E.CI load2A1, [track2A + 4x<1>]; +-:-:-:-:00 @P2 LDG.E.CI load2A2, [track2A + 4x<2>]; +-:-:-:-:00 @P3 LDG.E.CI load2A3, [track2A + 4x<3>]; + +-:-:-:-:00 @!P0 MOV load2A0, RZ; +-:-:-:-:00 @!P1 MOV load2A1, RZ; +-:-:-:-:00 @!P2 MOV load2A2, RZ; +-:-:-:-:00 @!P3 MOV load2A3, RZ; + +-:-:-:-:00 ISETP.LT.AND P5, PT, tidAY3, k, PT; +-:-:-:-:00 ISETP.LT.AND P0, PT, txa, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P5; +-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +-:-:-:-:00 @P0 LDG.E.CI load3A0, [track3A + 4x<0>]; +-:-:-:-:00 @P1 LDG.E.CI load3A1, [track3A + 4x<1>]; +-:-:-:-:00 @P2 LDG.E.CI load3A2, [track3A + 4x<2>]; +-:-:-:-:00 @P3 LDG.E.CI load3A3, [track3A + 4x<3>]; + +-:-:-:-:00 @!P0 MOV load3A0, RZ; +-:-:-:-:00 @!P1 MOV load3A1, RZ; +-:-:-:-:00 
@!P2 MOV load3A2, RZ; +-:-:-:-:00 @!P3 MOV load3A3, RZ; + +-:-:-:-:00 IADD txb1, txb, 1; +-:-:-:-:00 IADD txb2, txb, 2; +-:-:-:-:00 IADD txb3, txb, 3; + +-:-:-:-:00 ISETP.LT.AND P4, PT, tidBY, k, PT; +-:-:-:-:00 ISETP.LT.AND P0, PT, txb, param_n, P4; +-:-:-:-:00 ISETP.LT.AND P1, PT, txb1, param_n, P4; +-:-:-:-:00 ISETP.LT.AND P2, PT, txb2, param_n, P4; +-:-:-:-:00 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +-:-:-:-:00 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +-:-:-:-:00 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +-:-:-:-:00 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +-:-:-:-:00 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +-:-:-:-:00 @!P0 MOV loadB0, RZ; +-:-:-:-:00 @!P1 MOV loadB1, RZ; +-:-:-:-:00 @!P2 MOV loadB2, RZ; +-:-:-:-:00 @!P3 MOV loadB3, RZ; + +-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT; +-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +-:-:-:-:00 ISETP.GE.AND P2, PT, k, 32, P5; +-:-:-:-:00 ISETP.GE.AND P3, PT, k, 32, P5; +-:-:-:-:00 ISETP.GE.AND P5, PT, k, 32, P5; +-:-:-:-:00 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +-:-:-:-:00 LOP.AND.NZ P1, RZ, k, 15; + +-:-:-:-:00 STS.128 [writeAs + 4x<0*128>], load0A; +-:-:-:-:00 IADD track0A0.CC, track0A0, lda16; +-:-:-:-:00 IADD.X track0A1, track0A1, RZ; + +-:-:-:-:00 STS.128 [writeAs + 4x<4*128>], load1A; +-:-:-:-:00 IADD track1A0.CC, track1A0, lda16; +-:-:-:-:00 IADD.X track1A1, track1A1, RZ; + +-:-:-:-:00 STS.128 [writeAs + 4x<8*128>], load2A; +-:-:-:-:00 IADD track2A0.CC, track2A0, lda16; +-:-:-:-:00 IADD.X track2A1, track2A1, RZ; + +-:-:-:-:00 STS.128 [writeAs + 4x<12*128>], load3A; +-:-:-:-:00 IADD track3A0.CC, track3A0, lda16; +-:-:-:-:00 IADD.X track3A1, track3A1, RZ; + +-:-:-:-:00 STS.128 [writeBs], loadB; +-:-:-:-:00 IADD trackB0.CC, trackB0, ldb16; + +-:-:-:-:00 ISETP.GT.AND P1, PT, k, 16, P1; + +-:-:-:-:00 IADD readBs, readBs, -swapBuf; +-:-:-:-:00 IADD readAs, readAs, -swapBuf; +-:-:-:-:00 BAR.SYNC 0; +-:-:-:-:00 IADD writeBs, writeBs, swapBuf; +-:-:-:-:00 IADD writeAs, 
writeAs, swapBuf; +-:-:-:-:00 IADD swapBuf, RZ, -swapBuf; + +-:-:-:-:00 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? q{ +-:-:-:-:00 @P5 LDG.E.CI.128 load0A, [track0A]; +-:-:-:-:00 @P5 LDG.E.CI.128 load1A, [track1A]; +-:-:-:-:00 @P5 LDG.E.CI.128 load2A, [track2A]; +-:-:-:-:00 @P5 LDG.E.CI.128 load3A, [track3A]; +-:-:-:-:00 @P6 LDG.E.CI.128 loadB, [trackB]; + } : q{ +-:-:-:-:00 @P5 LDG.E.CI load0A0, [track0A + 4x<0>]; +-:-:-:-:00 @P5 LDG.E.CI load0A1, [track0A + 4x<1>]; +-:-:-:-:00 @P5 LDG.E.CI load0A2, [track0A + 4x<2>]; +-:-:-:-:00 @P5 LDG.E.CI load0A3, [track0A + 4x<3>]; + +-:-:-:-:00 @P5 LDG.E.CI load1A0, [track1A + 4x<0>]; +-:-:-:-:00 @P5 LDG.E.CI load1A1, [track1A + 4x<1>]; +-:-:-:-:00 @P5 LDG.E.CI load1A2, [track1A + 4x<2>]; +-:-:-:-:00 @P5 LDG.E.CI load1A3, [track1A + 4x<3>]; + +-:-:-:-:00 @P5 LDG.E.CI load2A0, [track2A + 4x<0>]; +-:-:-:-:00 @P5 LDG.E.CI load2A1, [track2A + 4x<1>]; +-:-:-:-:00 @P5 LDG.E.CI load2A2, [track2A + 4x<2>]; +-:-:-:-:00 @P5 LDG.E.CI load2A3, [track2A + 4x<3>]; + +-:-:-:-:00 @P5 LDG.E.CI load3A0, [track3A + 4x<0>]; +-:-:-:-:00 @P5 LDG.E.CI load3A1, [track3A + 4x<1>]; +-:-:-:-:00 @P5 LDG.E.CI load3A2, [track3A + 4x<2>]; +-:-:-:-:00 @P5 LDG.E.CI load3A3, [track3A + 4x<3>]; + +-:-:-:-:00 @P6 LDG.E.CI loadB0, [trackB + 4x<0>]; +-:-:-:-:00 @P6 LDG.E.CI loadB1, [trackB + 4x<1>]; +-:-:-:-:00 @P6 LDG.E.CI loadB2, [trackB + 4x<2>]; +-:-:-:-:00 @P6 LDG.E.CI loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 0; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "-:-:-:-:00 IADD k, k, -16;\n", + j0c14 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n", + j5c6 => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n", + j7c6 => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n", + j9c6 => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n", + j11c6 => "-:-:-:-:00 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "-:-:-:-:00 \@P2 
IADD track0A0.CC, track0A0, lda16;\n", + j3c13 => "-:-:-:-:00 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "-:-:-:-:00 \@P3 IADD track1A0.CC, track1A0, lda16;\n", + j5c13 => "-:-:-:-:00 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "-:-:-:-:00 \@P5 IADD track2A0.CC, track2A0, lda16;\n", + j7c13 => "-:-:-:-:00 \@P5 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "-:-:-:-:00 \@P5 IADD track3A0.CC, track3A0, lda16;\n", + j9c13 => "-:-:-:-:00 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "-:-:-:-:00 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "-:-:-:-:00 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "-:-:-:-:00 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "-:-:-:-:00 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "-:-:-:-:00 \@P0 BAR.SYNC 0;\n" . + "-:-:-:-:00 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "-:-:-:-:00 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j3c29 => "-:-:-:-:00 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j5c29 => "-:-:-:-:00 \@P3 LDG.E.CI.128 load1A, [track1A];\n", + j9c29 => "-:-:-:-:00 \@P5 LDG.E.CI.128 load2A, [track2A];\n", + j9c31 => "-:-:-:-:00 \@P5 LDG.E.CI.128 load3A, [track3A];\n", + j11c29 => "-:-:-:-:00 \@P6 LDG.E.CI.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "-:-:-:-:00 \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n", + j3c31 => "-:-:-:-:00 \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n", + j4c1 => "-:-:-:-:00 \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n", + j4c3 => "-:-:-:-:00 \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n", + + j5c29 => "-:-:-:-:00 \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n", + j5c31 => "-:-:-:-:00 \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n", + j6c1 => "-:-:-:-:00 \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n", + j6c3 => "-:-:-:-:00 \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n", + + j9c29 => "-:-:-:-:00 \@P5 LDG.E.CI load2A0, [track2A + 4x<0>];\n", + j9c31 => "-:-:-:-:00 \@P5 LDG.E.CI load2A1, [track2A + 4x<1>];\n", + j10c1 => "-:-:-:-:00 \@P5 LDG.E.CI load2A2, [track2A + 4x<2>];\n", + j10c3 => "-:-:-:-:00 \@P5 LDG.E.CI load2A3, [track2A + 4x<3>];\n", + + j10c8 => "-:-:-:-:00 \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n", + j10c10 => "-:-:-:-:00 \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n", + j10c12 => "-:-:-:-:00 \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n", + j10c14 => "-:-:-:-:00 \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n", + + j11c29 => "-:-:-:-:00 \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j11c31 => "-:-:-:-:00 \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j12c1 => "-:-:-:-:00 \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j12c3 => "-:-:-:-:00 \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n" . 
+ "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n" + ); + return ; + + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_common_128x128.sass b/Kernel/SGEMM/Maxwell/hgemm_common_128x128.sass new file mode 100644 index 0000000..d699483 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_common_128x128.sass @@ -0,0 +1,412 @@ +# hgemm_common_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +[- + +our $int16; + +sub convert_in { + return $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; +} + + +sub convert_out { + return $int16 ? 'F2I.S16.F32': 'F2F.F16.F32'; +} + + +sub scale_int16 { + return $int16? q{ +--:-:-:-:1 FMUL c0, c0, param_scale; +--:-:-:-:1 FMUL c1, c1, param_scale; +--:-:-:-:1 FMUL c2, c2, param_scale; +--:-:-:-:0 FMUL c3, c3, param_scale; + } : ""; +} + + +sub max_abs1 { + return $int16? q{ +--:-:-:-:1 @!P0 MOV cs0, RZ; +--:-:-:-:1 @!P1 MOV cs1, RZ; +--:-:-:-:1 @!P2 MOV cs2, RZ; +--:-:-:-:1 @!P3 MOV cs3, RZ; + +--:-:-:-:1 @P0 VABSDIFF.S16.S16.MRG_16L cs0, c0, RZ, RZ; +--:-:-:-:1 @P1 VABSDIFF.S16.S16.MRG_16L cs1, c1, RZ, RZ; +--:-:-:-:1 @P2 VABSDIFF.S16.S16.MRG_16L cs2, c2, RZ, RZ; +--:-:-:-:1 @P3 VABSDIFF.S16.S16.MRG_16L cs3, c3, RZ, RZ; + } : ""; +} + + +sub max_abs2 { + return $int16? 
q{ + + +// a = abs(a) +--:-:-:-:1 @P0 VABSDIFF.S16.S16.MRG_16H cs0, c0, RZ, cs0; +--:-:-:-:1 @P1 VABSDIFF.S16.S16.MRG_16H cs1, c1, RZ, cs1; +--:-:-:-:1 @P2 VABSDIFF.S16.S16.MRG_16H cs2, c2, RZ, cs2; +--:-:-:-:1 @P3 VABSDIFF.S16.S16.MRG_16H cs3, c3, RZ, cs3; + +// max = max(c,d,max(a,b,max)) ... +--:-:-:-:1 VMNMX.UD.U16.U16.MX.MAX maxabs, cs0, cs0.H1, maxabs; +--:-:-:-:1 VMNMX.UD.U16.U16.MX.MAX maxabs, cs1, cs1.H1, maxabs; +--:-:-:-:1 VMNMX.UD.U16.U16.MX.MAX maxabs, cs2, cs2.H1, maxabs; +--:-:-:-:1 VMNMX.UD.U16.U16.MX.MAX maxabs, cs3, cs3.H1, maxabs; + + + } : ""; +} + + +sub butterfly { + return $int16 ? q{ +--:-:-:-:0 LOP.AND.Z P0, RZ, tid, 31; +--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x10, 0x1f; +01:-:-:-:4 IMNMX maxabs, warp_max, maxabs, !PT; +--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x8, 0x1f; +01:-:-:-:4 IMNMX maxabs, warp_max, maxabs, !PT; +--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x4, 0x1f; +01:-:-:-:4 IMNMX maxabs, warp_max, maxabs, !PT; +--:-:-:-:0 MOV Stats0, param_Stats[0]; +--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x2, 0x1f; +01:-:-:-:4 IMNMX maxabs, warp_max, maxabs, !PT; +--:-:-:-:0 MOV Stats1, param_Stats[1]; +--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x1, 0x1f; +01:-:-:-:2 IMNMX maxabs, warp_max, maxabs, !PT; +--:-:-:-:1 @P0 RED.E.MAX [Stats], maxabs; + } : ""; +} + +-] + + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>]; + +LOOP: + +[+ + + our @top; + our %insert; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? 
'@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $yield = $c == 32 ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + ++] + +--:-:-:-:1 IADD loop, loop, 1; +--:-:-:-:1 IADD ta, ta, param_ldaz; +--:-:-:-:1 IADD tb, tb, param_ldbz; +--:-:-:-:3 MOV k, param_k; +--:-:-:-:1 ISETP.LT.AND P1, PT, loop, param_loops, PT; +--:-:-:-:6 LEA trackA0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 1; +--:-:-:-:6 LEA trackB0.CC, tb, param_B[0], 1; +--:-:-:-:0 LEA.HI.X trackB1, tb, param_B[1], RZ, 1; +--:-:-:Y:5 @P1 BRA.U REMAINDER; + + + +// writeCs = (readAs / 4) * 128 + readBs; +--:-:-:-:1 LOP.AND readAs, readAs, 0xfff; +--:-:-:-:1 LOP.AND readBs, readBs, 0xfff; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 5; + +--:-:-:-:1 LOP.AND tid_31, tid, 31; +--:-:-:-:1 LOP.AND tid_96, tid, 96; +--:-:-:-:1 LOP.AND tid_128, tid, 128; + +// cx = tid31 | (tid_128 >> 2); +--:-:-:-:1 SHR.U32 cx00, tid_128, 2; +--:-:-:-:1 LOP.OR cx00, tid_31, cx00; + +// readCs = ((tid_96 << 4) | cx) << 2; +--:-:-:-:1 SHL readCs, tid_96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, cx00; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx += blkB*128; +--:-:-:-:1 ISCADD cx00, blkB, 
cx00, 7; +--:-:-:-:1 IADD cx64, cx00, 64; + +// cy = blkA*128 + (tid_96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid_96, 1; +--:-:-:-:1 ISCADD cy00, blkA, cy00, 7; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx00, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 1; + +--:-:-:-:1 SHL ldc1, ldc, 1; +--:-:-:-:1 SHL ldc4, ldc, 3; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 7; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; +--:-:-:-:1 MOV maxabs, RZ; + +--:-:-:-:1 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . 
+ "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +[+ butterfly() +] +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.S16 d0, [C00y0 + 2x<00>]; +--:-:2:-:1 @P1 LDG.E.S16 d1, [C00y0 + 2x<64>]; +--:-:3:-:1 @P2 LDG.E.S16 d2, [C04y0 + 2x<00>]; +--:-:4:-:1 @P3 LDG.E.S16 d3, [C04y0 + 2x<64>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c4, c4, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c5, c5, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c6, c6, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c7, c7, RZ, !PT; + +--:-:-:-:5 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 + + +--:-:-:-:3 STS.128 [writeCs+4x<00>], c0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], c4; + +--:-:-:-:0 IADD cy00, cy00, 1; + +--:-:-:-:1 LDS c0, [readCs + 4x<0*128 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<0*128 + 64>]; +--:-:-:-:1 LDS c2, [readCs + 4x<1*128 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<1*128 + 64>]; + +--:-:-:-:0 IADD cy04, cy04, 1; + +01:-:1:-:1 @P6 [+ convert_in() +] d0, d0; +02:-:2:-:1 @P6 
[+ convert_in() +] d1, d1; +04:-:3:-:1 @P6 [+ convert_in() +] d2, d2; +08:-:4:-:1 @P6 [+ convert_in() +] d3, d3; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:3 @P6 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +01:-:-:-:1 @P0 STG.E.S16 [C00y0 + 2x<00>], c0; +02:5:-:-:1 @P1 STG.E.S16 [C00y0 + 2x<64>], c1; +04:-:-:-:1 @P2 STG.E.S16 [C04y0 + 2x<00>], c2; +08:6:-:-:1 @P3 STG.E.S16 [C04y0 + 2x<64>], c3; + +[+ max_abs1() +] + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.S16 d0, [C08y0 + 2x<00>]; +--:-:2:-:1 @P1 LDG.E.S16 d1, [C08y0 + 2x<64>]; +--:-:3:-:1 @P2 LDG.E.S16 d2, [C12y0 + 2x<00>]; +--:-:4:-:1 @P3 LDG.E.S16 d3, [C12y0 + 2x<64>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +--:-:-:-:2 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:2 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + + +10:-:-:-:4 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +20:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:0 IADD.X C04y1, C04y1, RZ; + +--:-:-:-:1 LDS c0, [readCs + 4x<2*128 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<2*128 + 64>]; +--:-:-:-:1 LDS c2, [readCs + 4x<3*128 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*128 + 64>]; + +01:-:1:-:4 @P6 [+ 
convert_in() +] d0, d0; +02:-:2:-:4 @P6 [+ convert_in() +] d1, d1; +04:-:3:-:4 @P6 [+ convert_in() +] d2, d2; +08:-:4:-:1 @P6 [+ convert_in() +] d3, d3; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:3 @P6 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +01:-:-:-:1 @P0 STG.E.S16 [C08y0 + 2x<00>], c0; +02:5:-:-:1 @P1 STG.E.S16 [C08y0 + 2x<64>], c1; +04:-:-:-:1 @P2 STG.E.S16 [C12y0 + 2x<00>], c2; +08:6:-:-:1 @P3 STG.E.S16 [C12y0 + 2x<64>], c3; + +[+ max_abs2() +] + +10:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +20:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_common_128x32.sass b/Kernel/SGEMM/Maxwell/hgemm_common_128x32.sass new file mode 100644 index 0000000..9d4860a --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_common_128x32.sass @@ -0,0 +1,246 @@ +# hgemm_common_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ? $shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 16 && $stall ? 
'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 32 + readBs; +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 3; + +// readCs = (((tid & 96) << 2) | (tid & 31)) << 2; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 2; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*32 + tid31; +--:-:-:-:1 ISCADD cx, blkB, tid31, 5; + +// cy = blkA*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 ISCADD cy00, blkA, cy00, 7; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 1; + +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; + +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P4, RZ, flags, 2; + +--:-:-:-:1 SHL ldc1, ldc, 1; +--:-:-:-:1 SHL ldc4, ldc, 3; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 7; + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . 
+ "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.U16 d0, [C00y]; +--:-:2:-:1 @P1 LDG.E.U16 d1, [C04y]; +--:-:3:-:1 @P2 LDG.E.U16 d2, [C08y]; +--:-:4:-:1 @P3 LDG.E.U16 d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:0 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*32>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*32>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*32>]; + + +01:-:1:-:1 @P5 
F2F.F32.F16 d0, d0; +02:-:2:-:1 @P5 F2F.F32.F16 d1, d1; +04:-:3:-:1 @P5 F2F.F32.F16 d2, d2; +08:-:4:-:1 @P5 F2F.F32.F16 d3, d3; + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P5 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +01:1:-:-:1 @P0 STG.E.CG.U16 [C00y], c0; +02:2:-:-:1 @P1 STG.E.CG.U16 [C04y], c1; +04:3:-:-:1 @P2 STG.E.CG.U16 [C08y], c2; +08:4:-:-:1 @P3 STG.E.CG.U16 [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_common_128x64.sass b/Kernel/SGEMM/Maxwell/hgemm_common_128x64.sass new file mode 100644 index 0000000..a375c03 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_common_128x64.sass @@ -0,0 +1,318 @@ +# hgemm_common_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +LOOP: + + + + our @top; + our %insert; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $yield = $c == 32 ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 
0 : 1; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +--:-:1:-:1 S2R threadId, SR_TID.X; +--:-:2:-:1 S2R blockA, SR_CTAID.Y; +--:-:3:-:1 S2R blockB, SR_CTAID.Z; +--:-:4:-:1 S2R blockZ, SR_CTAID.X; + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 64 + readBs; +--:-:-:-:1 LOP.AND readAs, readAs, 0xff; +--:-:-:-:1 LOP.AND readBs, readBs, 0xff; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 4; + +// readCs = (((threadId & 96) << 3) | (threadId & 31)) << 2; +01:-:-:-:1 LOP.AND tid31, threadId, 31; +01:-:-:-:1 LOP.AND tid96, threadId, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx00 = blkB*64 + tid31; +04:-:-:-:1 ISCADD cx00, blockB, tid31, 6; +--:-:-:-:1 IADD cx32, cx00, 32; + +// cy = blkA*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +02:-:-:-:1 ISCADD cy00, blockA, cy00, 7; + +// C += (cy*ldc + cx00) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx00, xmad_c; +08:-:-:-:1 XMAD.LO2 ci, ldcz, blockZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 1; + +--:-:-:-:1 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 + +--:-:-:-:1 SHL ldc1, ldc, 1; +--:-:-:-:1 SHL ldc4, ldc, 3; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 7; + + + +//--:-:1:-:2 I2F.F32.U32 temp, threadId; +//01:-:-:-:1 F2F.F16.F32 temp, temp; + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) 
+ { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, P6; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.S16 d0, [C00y0 + 2x<00>]; +--:-:2:-:1 @P1 LDG.E.S16 d1, [C00y0 + 2x<32>]; +--:-:3:-:1 @P2 LDG.E.S16 d2, [C04y0 + 2x<00>]; +--:-:4:-:1 @P3 LDG.E.S16 d3, [C04y0 + 2x<32>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +// Apply relu 
+--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; // P6 temporarily holds the relu flag (flags & 2)
+--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1 @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1 @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1 @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1 @P6 FMNMX c7, c7, RZ, !PT;
+
+--:-:-:-:5 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 (P6 restored after the relu test)
+
+
+--:-:-:-:1 STS.128 [writeCs+4x<00>], c0; // exchange the scaled values through shared memory
+--:-:-:-:1 STS.128 [writeCs+4x<32>], c4;
+
+--:-:-:-:0 IADD cy00, cy00, 1; // row index for the next CAL STORE_C
+
+--:-:-:-:1 LDS c0, [readCs + 4x<0*64 + 00>];
+--:-:5:-:1 LDS c1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1 LDS c2, [readCs + 4x<1*64 + 00>];
+--:-:6:-:1 LDS c3, [readCs + 4x<1*64 + 32>];
+
+--:-:-:-:0 IADD cy04, cy04, 1; // row index for the next CAL STORE_C
+
+01:-:1:-:1 @P6 F2F.F32.F16 d0, d0; // beta != 0: widen the loaded C values to f32
+02:-:2:-:1 @P6 F2F.F32.F16 d1, d1;
+04:-:3:-:1 @P6 F2F.F32.F16 d2, d2;
+08:-:4:-:1 @P6 F2F.F32.F16 d3, d3;
+
+11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; // c += beta * C
+02:-:-:-:1 @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1 @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:0 @P6 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1 F2F.F16.F32 c0, c0; // narrow the results to f16 for the store
+--:-:2:-:1 F2F.F16.F32 c1, c1;
+--:-:3:-:1 F2F.F16.F32 c2, c2;
+--:-:4:-:1 F2F.F16.F32 c3, c3;
+
+
+
+--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; // P4 = cx00 < n && beta != 0
+--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, P6; // P5 = cx32 < n && beta != 0
+
+// P6 must keep holding (beta != 0) through RET: the @P6 blend for rows cy08/cy12 and the guards at the top of the next CAL STORE_C read it.
+// The former "flags & 1" (stochastic round) test overwrote P6 here although nothing consumed that flag, so it is no longer loaded.
+
+01:-:-:-:1 @P0 STG.E.S16 [C00y0 + 2x<00>], c0; // store rows cy00/cy04 at columns +0 and +32
+02:5:-:-:1 @P1 STG.E.S16 [C00y0 + 2x<32>], c1;
+04:-:-:-:1 @P2 STG.E.S16 [C04y0 + 2x<00>], c2;
+08:6:-:-:1 @P3 STG.E.S16 [C04y0 + 2x<32>], c3;
+
+--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; // load guards for rows cy08/cy12 (include beta != 0 via P4/P5)
+--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1 @P0 LDG.E.S16 d0, [C08y0 + 2x<00>]; // read existing C for the beta blend
+--:-:2:-:1 @P1 LDG.E.S16 d1, [C08y0 + 2x<32>];
+--:-:3:-:1 @P2 LDG.E.S16 d2, [C12y0 + 2x<00>];
+--:-:4:-:1 @P3 LDG.E.S16 d3, [C12y0 + 2x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ; // nothing loaded: blend with zero
+--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, PT; + +--:-:-:-:2 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:2 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + + +10:-:-:-:4 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +20:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:0 IADD.X C04y1, C04y1, RZ; + +--:-:-:-:1 LDS c0, [readCs + 4x<2*64 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<3*64 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*64 + 32>]; + +01:-:1:-:1 @P6 F2F.F32.F16 d0, d0; +02:-:2:-:1 @P6 F2F.F32.F16 d1, d1; +04:-:3:-:1 @P6 F2F.F32.F16 d2, d2; +08:-:4:-:1 @P6 F2F.F32.F16 d3, d3; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P6 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +01:-:-:-:1 @P0 STG.E.S16 [C08y0 + 2x<00>], c0; +02:5:-:-:1 @P1 STG.E.S16 [C08y0 + 2x<32>], c1; +04:-:-:-:1 @P2 STG.E.S16 [C12y0 + 2x<00>], c2; +08:6:-:-:1 @P3 STG.E.S16 [C12y0 + 2x<32>], c3; + +10:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +20:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_common_32x128.sass b/Kernel/SGEMM/Maxwell/hgemm_common_32x128.sass new file mode 100644 index 0000000..3661b08 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_common_32x128.sass @@ -0,0 +1,244 @@ +# Kernel: hgemm_common_32x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*32 + 16 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ? 
$shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 16 && $stall ? 'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 128 + readBs; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 5; + +// readCs = tid * 4; +--:-:-:-:1 SHL readCs, tid, 2; + +// cx = blkB*128 + tid; +--:-:-:-:1 ISCADD cx, blkB, tid, 7; + +// cy = blkA*32 +--:-:-:-:1 SHL cy00, blkA, 5; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; + +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 1; + +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; + +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P4, RZ, flags, 
2; + +--:-:-:-:1 SHL ldc1, ldc, 1; +--:-:-:-:1 SHL ldc4, ldc, 3; +--:-:-:-:1 ISCADD ldc12, ldc, -ldc4, 5; + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc12;\n" . + "--:-:-:-:1 IADD cy00, cy00, 12;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc12;\n" . + "--:-:-:-:1 IADD cy04, cy04, 12;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc12;\n" . + "--:-:-:-:1 IADD cy08, cy08, 12;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc12;\n" . + "--:-:-:-:1 IADD cy12, cy12, 12;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.U16 d0, [C00y]; +--:-:2:-:1 @P1 LDG.E.U16 d1, [C04y]; +--:-:3:-:1 @P2 LDG.E.U16 d2, [C08y]; +--:-:4:-:1 @P3 LDG.E.U16 d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:0 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*128>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*128>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*128>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*128>]; + + +01:-:1:-:1 @P5 F2F.F32.F16 d0, d0; +02:-:2:-:1 @P5 F2F.F32.F16 d1, d1; +04:-:3:-:1 @P5 F2F.F32.F16 d2, d2; +08:-:4:-:1 @P5 F2F.F32.F16 d3, d3; + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P5 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +01:1:-:-:1 @P0 STG.E.CG.U16 [C00y], c0; +02:2:-:-:1 @P1 STG.E.CG.U16 [C04y], c1; +04:3:-:-:1 @P2 STG.E.CG.U16 [C08y], c2; +08:4:-:-:1 @P3 STG.E.CG.U16 [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 
IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_128x128.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_128x128.sass new file mode 100644 index 0000000..0b4f460 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nn_128x128.sass @@ -0,0 +1,393 @@ +# Kernel: hgemm_nn_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $int16; +our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; +sub convert_in {return $convert;} + +sub int16_params { + return $int16 ? 
q{ +param_Stats[0] : c[0x0][0x190] +param_Stats[1] : c[0x0][0x194] +param_scale : c[0x0][0x198] + } : ""; +} +-] + + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + [+ int16_params() +] + + + + + 64-95 ~ tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, tid31, tid128, txa, xmad_ta, xmad_tb, k<1-3>, x<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-105 : loadB<0-3>, loadA<0-5> + + 106-109 : trackA<0-1>, trackB<0-1> + + 110-118 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop + 119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ + + 64-75 ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128 + + 64-79 : c<0-7>, d3, d2, d1, d0, cs<0-3> + 64-65 : Stats<0-1> + 80-89 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 90-118 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs + + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; 
+--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 MOV loop, RZ; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); ++] + +01:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid128, tid, 128; + +// tidAY = (tid & 1) << 2 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; + +// tidAX = tid >> 1 +--:-:-:-:1 SHR.U32 tidAX, tid, 1; + +// trackA += 2 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) & 7 +--:-:-:-:1 SHL tidBX, tid31, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x305; // 3 bits at position 5 + +// trackB += (blkB*128 + ldb*tidBY + tidBX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 7; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeAs = 4 * (128 * tidAY + tidAX) +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<128*8*2>, 2; + + +// writeBs = (128*tidBY + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 7; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<128*8*3>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, 
readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + +[+ + our $vec; + return $vec ? q{ +--:-:-:-:2 ISETP.LT.AND P3, PT, tidBY, k, P6; +--:-:-:Y:b ISETP.LT.AND P2, PT, tidAY, k, P5; + +--:-:4:-:2 @P3 LDG.E.CI.64 loadB0, [trackB]; +--:-:2:-:1 @P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>]; +--:-:2:-:1 @P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>]; + +--:-:-:-:0 PSETP.AND.AND P4, PT, PT, PT, PT; + +--:-:5:-:1 @!P3 LDS.U.64 loadB0, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.64 loadA0, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.64 loadA4, [addr_zero]; + } : q{ + + +// doLoad0 = tidBY < k +--:-:-:-:1 IADD x1, txb, 1; +--:-:-:-:1 IADD x2, txb, 2; +--:-:-:-:1 IADD x3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_n, P0; + +--:-:4:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>]; +--:-:4:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>]; +--:-:4:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>]; +--:-:4:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + +--:-:-:-:1 IADD k1, tidAY, 1; +--:-:-:-:1 IADD k2, tidAY, 2; +--:-:-:-:1 IADD k3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + + }; ++] + +[+ + our $vec; + our $convert; + return $vec ? 
qq{ +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 7; + +18:-:-:-:4 $convert loadB3, loadB1.H1; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 $convert loadB2, loadB1.H0; +--:-:-:-:4 $convert loadB1, loadB0.H1; +--:-:4:-:2 $convert loadB0, loadB0.H0; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +08:-:-:-:1 STS.128 [writeBs], loadB0; + +22:-:-:-:4 $convert loadA3, loadA1.H1; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<16>; +--:-:2:-:4 $convert loadA2, loadA1.H0; +--:-:-:-:4 $convert loadA1, loadA0.H1; +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; +--:-:3:-:1 $convert loadA0, loadA0.H0; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<3*128>], loadA3; +--:-:-:-:1 STS [writeAs + 4x<2*128>], loadA2; +04:-:-:-:1 STS [writeAs + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<0*128>], loadA0; + } : qq{ +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +08:-:-:-:4 $convert loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 $convert loadB1, loadB1; +--:-:-:-:4 $convert loadB2, loadB2; +--:-:4:-:2 $convert loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +08:-:-:-:1 STS.128 [writeBs], loadB0; + +02:-:-:-:4 $convert loadA0, loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<8>; +--:-:2:-:4 $convert loadA1, loadA1; +--:-:-:-:4 $convert loadA2, loadA2; +--:-:3:-:1 $convert loadA3, loadA3; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeAs + 4x<1*128>], loadA1; +04:-:-:-:1 STS [writeAs + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*128>], loadA3; + }; ++] + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 LOP.XOR writeAs, writeAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR writeBs, writeBs, 4x<128*8*2>; + + + +[+ + our $vec; + our $convert; + my $k_end = $vec ? 
16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n"); + our %insert = + ( + ($vec ? + ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P4, PT, !P4, PT, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c15 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P4, P5;\n", + + j0c10 => "--:-:2:-:1 \@P3 LDG.E.CI.64 loadB0, [trackB];\n", + + j0c28 => "--:-:5:-:1 \@P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];\n", + j0c30 => "20:4:6:-:1 \@P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];\n", + + j4c5 => "--:-:-:-:1 \@!P4 $convert loadA3, loadA5.H1;\n", + j4c9 => "--:-:-:-:1 \@!P4 $convert loadA2, loadA5.H0;\n", + j4c13 => "--:-:-:-:1 \@!P4 $convert loadA1, loadA4.H1;\n", + j4c17 => "--:-:-:-:1 \@!P4 $convert loadA0, loadA4.H0;\n", + + j5c5 => "02:-:-:-:1 \@P0 $convert loadB3, loadB1.H1;\n", + j5c9 => "--:-:-:-:1 \@P0 $convert loadB2, loadB1.H0;\n", + j5c13 => "--:-:-:-:1 \@P0 $convert loadB1, loadB0.H1;\n", + j5c17 => "--:-:2:-:1 \@P0 $convert loadB0, loadB0.H0;\n", + + j5c35 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c5 => "10:-:2:-:1 \@P4 $convert loadA3, loadA1.H1;\n", + j6c9 => "--:-:3:-:1 \@P4 $convert loadA2, loadA1.H0;\n", + j6c13 => "--:-:4:-:1 \@P4 $convert loadA1, loadA0.H1;\n", + j6c17 => "--:-:5:-:1 \@P4 $convert loadA0, loadA0.H0;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n", + j6c31 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n", + j6c33 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n", + j6c35 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n", + + j6c11 => "08:-:-:-:1 \@P4 IADD trackA0.CC, trackA0, 2x<16>;\n", + j6c54 => "--:-:-:-:1 \@P4 IADD.X trackA1, trackA1, RZ;\n", + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + j0c10 => "--:-:2:-:1 \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j0c12 => "--:-:2:-:1 \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j0c14 => "--:-:2:-:1 \@P3 
LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j0c16 => "--:-:2:-:1 \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j0c29 => "--:-:6:-:1 \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n", + j0c31 => "--:-:6:-:1 \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n", + j0c33 => "--:-:6:-:1 \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n", + j0c35 => "--:-:6:-:1 \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n", + + j5c8 => "02:-:-:-:1 \@P3 $convert loadB0, loadB0;\n", + j5c12 => "--:-:-:-:1 \@P3 $convert loadB1, loadB1;\n", + j5c16 => "--:-:-:-:1 \@P3 $convert loadB2, loadB2;\n", + j5c20 => "--:-:2:-:1 \@P3 $convert loadB3, loadB3;\n", + + j5c39 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c5 => "20:-:2:-:1 \@P2 $convert loadA0, loadA0;\n", + j6c9 => "--:-:3:-:1 \@P2 $convert loadA1, loadA1;\n", + j6c13 => "--:-:4:-:1 \@P2 $convert loadA2, loadA2;\n", + j6c17 => "--:-:5:-:1 \@P2 $convert loadA3, loadA3;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n", + j6c31 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n", + j6c33 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n", + j6c35 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n", + + j6c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 2x<8>;\n", + j6c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + ) + ), + + j5c46 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:0 IADD32I k, k, -8;\n" . + "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . 
+ "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; ++] + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_128x32.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_128x32.sass new file mode 100644 index 0000000..33a4a9a --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nn_128x32.sass @@ -0,0 +1,590 @@ +# Kernel: hgemm_nn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(128*16 + 32)*2 + 32*16*2> + szShareA : (128*16 + 32) + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, shiftAX + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 
32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb16, ldb, 5; +--:-:-:-:1 SHL lda32, lda, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += 2 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa00, blkA, tidAX, 7; +--:-:-:-:1 IADD txa32, txa00, 32; +--:-:-:-:1 IADD txa64, txa00, 64; +--:-:-:-:1 IADD txa96, txa00, 96; + +--:-:-:-:1 XMAD.LO ta00, lda, txa00, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta00, ldaz, blkZ, ta00; +--:-:-:-:1 IADD ta32, ta00, lda32; +--:-:-:-:1 IADD ta64, ta32, lda32; +--:-:-:-:1 IADD ta96, ta64, lda32; + +--:-:-:-:1 LEA track0A0.CC, 
ta00, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta00, param_A[1], RZ, 1; +--:-:-:-:1 LEA track1A0.CC, ta32, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track1A1, ta32, param_A[1], RZ, 1; +--:-:-:-:1 LEA track2A0.CC, ta64, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track2A1, ta64, param_A[1], RZ, 1; +--:-:-:-:1 LEA track3A0.CC, ta96, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track3A1, ta96, param_A[1], RZ, 1; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 1; + +// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? 
q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.CI.64 load0A, [track0A]; +--:-:2:-:1 @P3 LDG.E.CI.64 load1A, [track1A]; +--:-:3:-:1 @P4 LDG.E.CI.64 load2A, [track2A]; +--:-:4:-:1 @P5 LDG.E.CI.64 load3A, [track3A]; +--:-:5:-:1 @P6 LDG.E.CI.64 loadB, [trackB]; + + + +--:-:6:-:1 @!P2 LDS.U.64 load0A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 load1A, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.64 load2A, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.64 load3A, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.64 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:1:-:1 @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:1:-:1 @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:1:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:1:-:1 @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI.U16 load1A0, 
[track1A + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI.U16 load2A0, [track2A + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.U16 load2A1, [track2A + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load2A2, [track2A + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load2A3, [track2A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI.U16 load3A0, [track3A + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.U16 load3A1, [track3A + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.U16 load3A2, [track3A + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load3A3, [track3A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:5:-:1 @P0 LDG.E.CI.U16 loadB0, [trackB + 2x<0>]; +--:-:5:-:1 @P1 LDG.E.CI.U16 
loadB1, [trackB + 2x<1>]; +--:-:5:-:1 @P2 LDG.E.CI.U16 loadB2, [trackB + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 loadB3, [trackB + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P1, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + + + + + our $vec; + return $vec ? q{ +21:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:1:-:1 F2F.F32.F16 load0A0, load0A0.H0; + +02:-:-:-:1 F2F.F32.F16 load1A3, load1A1.H1; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A1.H0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A0.H1; +--:-:2:-:1 F2F.F32.F16 load1A0, load1A0.H0; + +04:-:-:-:1 F2F.F32.F16 load2A3, load2A1.H1; +--:-:-:-:1 F2F.F32.F16 load2A2, load2A1.H0; +--:-:-:-:1 F2F.F32.F16 load2A1, load2A0.H1; +--:-:3:-:1 F2F.F32.F16 load2A0, load2A0.H0; + +08:-:-:-:1 F2F.F32.F16 load3A3, load3A1.H1; +--:-:-:-:1 F2F.F32.F16 load3A2, load3A1.H0; +--:-:-:-:1 F2F.F32.F16 load3A1, load3A0.H1; +--:-:4:-:1 F2F.F32.F16 load3A0, load3A0.H0; + +10:-:-:-:1 F2F.F32.F16 loadB3, loadB1.H1; +--:-:-:-:1 F2F.F32.F16 loadB2, loadB1.H0; +--:-:-:-:1 F2F.F32.F16 loadB1, loadB0.H1; +--:-:5:-:1 F2F.F32.F16 loadB0, loadB0.H0; + } : q{ +21:-:-:-:1 F2F.F32.F16 load0A0, load0A0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:1:-:1 F2F.F32.F16 load0A3, load0A3; + +02:-:-:-:1 F2F.F32.F16 load1A0, load1A0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A1; +--:-:-:-:1 
F2F.F32.F16 load1A2, load1A2; +--:-:2:-:1 F2F.F32.F16 load1A3, load1A3; + +04:-:-:-:1 F2F.F32.F16 load2A0, load2A0; +--:-:-:-:1 F2F.F32.F16 load2A1, load2A1; +--:-:-:-:1 F2F.F32.F16 load2A2, load2A2; +--:-:3:-:1 F2F.F32.F16 load2A3, load2A3; + +08:-:-:-:1 F2F.F32.F16 load3A0, load3A0; +--:-:-:-:1 F2F.F32.F16 load3A1, load3A1; +--:-:-:-:1 F2F.F32.F16 load3A2, load3A2; +--:-:4:-:1 F2F.F32.F16 load3A3, load3A3; + +10:-:-:-:1 F2F.F32.F16 loadB0, loadB0; +--:-:-:-:1 F2F.F32.F16 loadB1, loadB1; +--:-:-:-:1 F2F.F32.F16 loadB2, loadB2; +--:-:5:-:1 F2F.F32.F16 loadB3, loadB3; + }; + + +01:-:-:-:1 STS [writeAs + 4x<0*128 + 0*32>], load0A0; +--:-:-:-:0 IADD track0A0.CC, track0A0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 0*32>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 0*32>], load0A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 0*32>], load0A3; + +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128 + 1*32>], load1A0; +--:-:-:-:0 IADD track1A0.CC, track1A0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 1*32>], load1A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 1*32>], load1A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 1*32>], load1A3; + +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 2*32>], load2A0; +--:-:-:-:0 IADD track2A0.CC, track2A0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 2*32>], load2A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 2*32>], load2A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 2*32>], load2A3; + +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 3*32>], load3A0; +--:-:-:-:0 IADD track3A0.CC, track3A0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 3*32>], load3A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 3*32>], load3A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 3*32>], load3A3; + +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, 
readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.CI.64 load0A, [track0A]; +--:-:4:-:1 @P3 LDG.E.CI.64 load1A, [track1A]; +--:-:5:-:1 @P4 LDG.E.CI.64 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.CI.64 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.64 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:5:-:1 @P4 LDG.E.CI.U16 load2A0, [track2A + 2x<0>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2A1, [track2A + 2x<1>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2A2, [track2A + 2x<2>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2A3, [track2A + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n", + j3c10 => "--:-:-:-:1 \@P0 STS 
[writeAs + 4x<2*128 + 0*32>], load0A2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n", + + j9c6 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n", + + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 2x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2A0.CC, track2A0, 2x<16>;\n", + j7c13 => "--:-:-:-:1 \@P4 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, 2x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 
ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.64 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI.64 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.64 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.64 loadB, [trackB];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0A3, load0A1.H1;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A2, load0A1.H0;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A1, load0A0.H1;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1A3, load1A1.H1;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A2, load1A1.H0;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A1, load1A0.H1;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1A0, load1A0.H0;\n", + + j6c13 => "10:-:-:-:1 \@P4 F2F.F32.F16 load2A3, load2A1.H1;\n", + j6c17 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2A2, load2A1.H0;\n", + j6c21 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2A1, load2A0.H1;\n", + j6c25 => "--:-:5:-:1 \@P4 F2F.F32.F16 load2A0, load2A0.H0;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A3, load3A1.H1;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A2, load3A1.H0;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A1, load3A0.H1;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3A0, load3A0.H0;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n", + ) : 
+ ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];\n", + j9c31 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];\n", + j10c1 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];\n", + j10c3 => "--:-:5:-:1 \@P4 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0A0, load0A0;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A1, load0A1;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A2, load0A2;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0A3, load0A3;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1A0, load1A0;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A1, load1A1;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A2, load1A2;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1A3, load1A3;\n", + + j6c13 => "10:-:-:-:1 \@P4 
F2F.F32.F16 load2A0, load2A0;\n", + j6c17 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2A1, load2A1;\n", + j6c21 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2A2, load2A2;\n", + j6c25 => "--:-:5:-:1 \@P4 F2F.F32.F16 load2A3, load2A3;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A0, load3A0;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A1, load3A1;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A2, load3A2;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3A3, load3A3;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB3;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB2;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB0, loadB0;\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_128x64.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_128x64.sass new file mode 100644 index 0000000..8e6c457 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nn_128x64.sass @@ -0,0 +1,438 @@ +# Kernel: hgemm_nn_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*2 + 64*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ tid, blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, txa, txa1, ta, xmad_ta, tb, xmad_tb, tidAY, tidBY, k<1-3>, x<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-111 : loadA<0-7>, loadAA<0-3>, loadB<0-3> + + 112-117 : track0A<0-1>, track1A<0-1>, trackB<0-1> + + 118-122 ~ writeAs, writeBs, k, txb, swapBuf + 123-127 : readAs, readBs + + 64-83 ~ ldc, ldcz, ci, xmad_c, threadId, tid31, tid96, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C04y<0-1>, C08y<0-1>, C12y<0-1>, C00y<0-1> + 86-107 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +--:-:-:-:1 STS.128 
[addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +// tidAX = tid & 0xfe +// tidAY = (tid & 1) << 2 +01:-:-:-:1 LOP.AND tidAX, tid, 0xfe; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; + +// track0A = param_A + 2 * ((blkA*128 + tidAX) * lda + tidAY + blkZ*ldaz); track1A = track0A + 2*lda (row txa + 1) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 1; +--:-:-:-:1 LEA track1A0.CC, lda, track0A0, 1; +--:-:-:-:1 LEA.HI.X track1A1, lda, track0A1, RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa, param_m, PT; // P4: row txa is inside m (guards track0A loads) +--:-:-:-:1 IADD txa1, txa, 1; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa1, param_m, PT; // P5: row txa + 1 is inside m (guards track1A loads) + +// tidBX = (tid & 15) << 2 +// tidBY = (tid >> 4) & 7 +--:-:-:-:1 LOP.AND tidBX, tid, 15; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4 + +// trackB = param_B + 2 * (blkB*64 + tidBX + ldb*tidBY + blkZ*ldbz) +04:-:-:-:1 ISCADD txb, blkB, tidBX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 1; +--:-:-:-:2 LEA.HI.X trackB1, tb, param_B[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; // P6: column txb is inside n (guards trackB loads) + +// Start the write buffers high +// writeAs = (128*tidAY + tidAX) * 4 + 4x<64*8 + 128*8> +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; +// writeBs = (64*tidBY + tidBX) * 4 + 4x<64*8 + 128*8*2> +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; + +// Start the read buffers low +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = (((tid >> 1) & 7) << 4) + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 
ISCADD readBs, readBs, 4x<128*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<64*8 + 128*8>; + + +REMAINDER: + + + our $vec; + return $vec ? q{ +--:-:6:-:1 @P6 LDG.E.CI.64 loadB0, [trackB]; + +--:-:2:-:1 @P5 LDG.E.CI.64 loadA2, [track1A + 2x<0>]; +--:-:2:-:1 @P5 LDG.E.CI.64 loadAA2, [track1A + 2x<8>]; + +--:-:3:-:1 @P4 LDG.E.CI.64 loadA0, [track0A + 2x<0>]; +--:-:3:-:1 @P4 LDG.E.CI.64 loadAA0, [track0A + 2x<8>]; + +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +--:-:4:-:1 @!P6 LDS.U.64 loadB0, [addr_zero]; +--:-:5:-:1 @!P5 LDS.U.64 loadA2, [addr_zero]; +--:-:5:-:1 @!P4 LDS.U.64 loadA0, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.64 loadAA2, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.64 loadAA0, [addr_zero]; + } : q{ + +--:-:2:-:2 S2R tid, SR_TID.X; + + +02:-:-:-:1 LOP.AND tidAY, tid, 1; +--:-:-:-:1 SHL tidAY, tidAY, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4 + +// doLoad0 = tidBY < k +--:-:-:-:1 IADD x1, txb, 1; +--:-:-:-:1 IADD x2, txb, 2; +--:-:-:-:1 IADD x3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_n, P0; + +--:-:6:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<0>]; +--:-:6:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<1>]; +--:-:6:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<2>]; +--:-:6:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + +--:-:-:-:1 IADD k1, tidAY, 1; +--:-:-:-:1 IADD k2, tidAY, 2; +--:-:-:-:1 IADD k3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P4; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [track0A + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA2, [track0A + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA4, [track0A + 2x<2>]; 
+--:-:2:-:1 @P3 LDG.E.CI.S16 loadA6, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA2, RZ; +--:-:-:-:1 @!P2 MOV loadA4, RZ; +--:-:-:-:1 @!P3 MOV loadA6, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:3:-:1 @P0 LDG.E.CI.S16 loadA1, [track1A + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.S16 loadA3, [track1A + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.S16 loadA5, [track1A + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.S16 loadA7, [track1A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA1, RZ; +--:-:-:-:1 @!P1 MOV loadA3, RZ; +--:-:-:-:1 @!P2 MOV loadA5, RZ; +--:-:-:-:1 @!P3 MOV loadA7, RZ; + + }; + + + + our $vec; + return $vec ? q{ +28:-:-:-:4 F2F.F32.F16 loadB3, loadB1.H1; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 F2F.F32.F16 loadB2, loadB1.H0; +--:-:-:-:4 F2F.F32.F16 loadB1, loadB0.H1; +--:-:4:-:2 F2F.F32.F16 loadB0, loadB0.H0; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +08:-:-:-:1 STS.128 [writeBs], loadB0; + +12:-:-:-:4 F2F.F32.F16 loadA7, loadA3.H1; +04:-:2:-:4 F2F.F32.F16 loadA6, loadA1.H1; +--:-:-:-:0 IADD track0A0.CC, track0A0, 2x<16>; +--:-:-:-:4 F2F.F32.F16 loadA5, loadA3.H0; +--:-:3:-:4 F2F.F32.F16 loadA4, loadA1.H0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; +--:-:-:-:4 F2F.F32.F16 loadA3, loadA2.H1; +--:-:-:-:0 IADD track1A0.CC, track1A0, 2x<16>; +--:-:-:-:4 F2F.F32.F16 loadA1, loadA2.H0; +--:-:4:-:4 F2F.F32.F16 loadA2, loadA0.H1; +--:-:5:-:1 F2F.F32.F16 loadA0, loadA0.H0; + +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +02:-:-:-:1 STS.64 [writeAs + 4x<3*128>], loadA6; +04:-:-:-:1 STS.64 [writeAs + 4x<2*128>], loadA4; +08:-:-:-:1 STS.64 [writeAs + 4x<1*128>], loadA2; +10:-:-:-:1 STS.64 [writeAs + 4x<0*128>], loadA0; + + } : q{ + +20:-:-:-:4 F2F.F32.F16 loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 F2F.F32.F16 loadB1, loadB1; +--:-:-:-:4 F2F.F32.F16 loadB2, loadB2; 
+--:-:6:-:2 F2F.F32.F16 loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +20:-:-:-:1 STS.128 [writeBs], loadB0; + +02:-:-:-:4 F2F.F32.F16 loadA0, loadA0; +04:-:2:-:4 F2F.F32.F16 loadA1, loadA1; +--:-:-:-:0 IADD track0A0.CC, track0A0, 2x<8>; +--:-:-:-:4 F2F.F32.F16 loadA2, loadA2; +--:-:3:-:4 F2F.F32.F16 loadA3, loadA3; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; +--:-:-:-:4 F2F.F32.F16 loadA4, loadA4; +--:-:-:-:0 IADD track1A0.CC, track1A0, 2x<8>; +--:-:4:-:4 F2F.F32.F16 loadA5, loadA5; +--:-:-:-:4 F2F.F32.F16 loadA6, loadA6; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; +--:-:5:-:1 F2F.F32.F16 loadA7, loadA7; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +02:-:-:-:1 STS.64 [writeAs + 4x<0*128>], loadA0; +04:-:-:-:1 STS.64 [writeAs + 4x<1*128>], loadA2; +08:-:-:-:1 STS.64 [writeAs + 4x<2*128>], loadA4; +10:-:-:-:1 STS.64 [writeAs + 4x<3*128>], loadA6; + }; + + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + + + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n"); + our %insert = + ( + ($vec ? 
+ ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c15 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P5;\n", + + j0c10 => "--:-:2:-:1 \@P3 LDG.E.CI.64 loadB0, [trackB];\n", + + j0c28 => "--:-:4:-:1 \@P2 LDG.E.CI.64 loadA2, [track1A + 2x<0>];\n", + j0c30 => "--:-:4:-:1 \@P2 LDG.E.CI.64 loadAA2, [track1A + 2x<8>];\n", + + j0c31 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P4;\n", + + j0c44 => "--:-:5:-:1 \@P2 LDG.E.CI.64 loadA0, [track0A + 2x<0>];\n", + j0c46 => "--:-:6:-:1 \@P2 LDG.E.CI.64 loadAA0, [track0A + 2x<8>];\n", + + j3c53 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA7, loadAA3.H1;\n", + j3c57 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA6, loadAA1.H1;\n", + j3c61 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA5, loadAA3.H0;\n", + j4c1 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA4, loadAA1.H0;\n", + j4c5 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA3, loadAA2.H1;\n", + j4c9 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA1, loadAA2.H0;\n", + j4c13 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA2, loadAA0.H1;\n", + j4c17 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA0, loadAA0.H0;\n", + + j5c5 => "02:-:-:-:1 \@P3 F2F.F32.F16 loadB3, loadB1.H1;\n", + j5c9 => "--:-:-:-:1 \@P3 F2F.F32.F16 loadB2, loadB1.H0;\n", + j5c13 => "--:-:-:-:1 \@P3 F2F.F32.F16 loadB1, loadB0.H1;\n", + j5c17 => "--:-:2:-:1 \@P3 F2F.F32.F16 loadB0, loadB0.H0;\n", + + j5c35 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j5c53 => "08:-:-:-:1 \@P1 F2F.F32.F16 loadA7, loadA3.H1;\n", + j5c57 => "10:-:2:-:1 \@P1 F2F.F32.F16 loadA6, loadA1.H1;\n", + j5c61 => "--:-:-:-:1 \@P1 F2F.F32.F16 loadA5, loadA3.H0;\n", + j6c1 => "--:-:3:-:1 \@P1 F2F.F32.F16 loadA4, loadA1.H0;\n", + j6c5 => "--:-:-:-:1 \@P1 F2F.F32.F16 loadA3, loadA2.H1;\n", + j6c9 => "--:-:-:-:1 \@P1 F2F.F32.F16 loadA1, loadA2.H0;\n", + j6c13 => "--:-:4:-:1 \@P1 F2F.F32.F16 loadA2, loadA0.H1;\n", + j6c17 => "--:-:5:-:1 \@P1 F2F.F32.F16 loadA0, loadA0.H0;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS.64 [writeAs + 4x<3*128>], 
loadA6;\n", + j6c31 => "04:-:-:-:1 \@P0 STS.64 [writeAs + 4x<2*128>], loadA4;\n", + j6c33 => "08:-:-:-:1 \@P0 STS.64 [writeAs + 4x<1*128>], loadA2;\n", + j6c35 => "10:-:-:-:1 \@P0 STS.64 [writeAs + 4x<0*128>], loadA0;\n", + + j6c46 => "--:-:-:-:1 \@P1 IADD track1A0.CC, track1A0, 2x<16>;\n", + j6c54 => "--:-:-:-:1 \@P1 IADD.X track1A1, track1A1, RZ;\n", + j7c55 => "20:-:-:-:1 \@P1 IADD track0A0.CC, track0A0, 2x<16>;\n", + j7c61 => "--:-:-:-:1 \@P1 IADD.X track0A1, track0A1, RZ;\n", + + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P4;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + j0c10 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j0c12 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j0c14 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j0c16 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j0c33 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA0, [track0A + 2x<0>];\n", + j0c35 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA2, [track0A + 2x<1>];\n", + j0c37 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA4, [track0A + 2x<2>];\n", + j0c39 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA6, [track0A + 2x<3>];\n", + + j0c41 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + + j1c29 => "--:-:3:-:1 \@P2 LDG.E.CI.S16 loadA1, [track1A + 2x<0>];\n", + j1c31 => "--:-:3:-:1 \@P2 LDG.E.CI.S16 loadA3, [track1A + 2x<1>];\n", + j1c33 => "--:-:3:-:1 \@P2 LDG.E.CI.S16 loadA5, [track1A + 2x<2>];\n", + j1c35 => "--:-:3:-:1 \@P2 LDG.E.CI.S16 loadA7, [track1A + 2x<3>];\n", + + j5c8 => "20:-:-:-:1 \@P3 F2F.F32.F16 loadB0, loadB0;\n", + j5c12 => "--:-:-:-:1 \@P3 F2F.F32.F16 loadB1, loadB1;\n", + j5c16 => "--:-:-:-:1 \@P3 F2F.F32.F16 loadB2, loadB2;\n", + j5c20 => "--:-:6:-:1 \@P3 F2F.F32.F16 loadB3, loadB3;\n", + + j5c39 => "20:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j5c53 => "02:-:-:-:1 \@P0 F2F.F32.F16 loadA0, loadA0;\n", + j5c57 => "04:-:2:-:1 \@P0 F2F.F32.F16 loadA1, loadA1;\n", + j5c61 => "--:-:-:-:1 
\@P0 F2F.F32.F16 loadA2, loadA2;\n", + j6c1 => "--:-:3:-:1 \@P0 F2F.F32.F16 loadA3, loadA3;\n", + j6c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 loadA4, loadA4;\n", + j6c9 => "--:-:4:-:1 \@P0 F2F.F32.F16 loadA5, loadA5;\n", + j6c13 => "--:-:-:-:1 \@P0 F2F.F32.F16 loadA6, loadA6;\n", + j6c17 => "--:-:5:-:1 \@P0 F2F.F32.F16 loadA7, loadA7;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS.64 [writeAs + 4x<0*128>], loadA0;\n", + j6c31 => "04:-:-:-:1 \@P0 STS.64 [writeAs + 4x<1*128>], loadA2;\n", + j6c33 => "08:-:-:-:1 \@P0 STS.64 [writeAs + 4x<2*128>], loadA4;\n", + j6c35 => "10:-:-:-:1 \@P0 STS.64 [writeAs + 4x<3*128>], loadA6;\n", + + j6c46 => "--:-:-:-:1 \@P0 IADD track0A0.CC, track0A0, 2x<8>;\n", + j6c54 => "--:-:-:-:1 \@P0 IADD.X track0A1, track0A1, RZ;\n", + j6c55 => "--:-:-:-:1 \@P0 IADD track1A0.CC, track1A0, 2x<8>;\n", + j6c61 => "--:-:-:-:1 \@P0 IADD.X track1A1, track1A1, RZ;\n", + ) + ), + + j5c46 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:0 IADD32I k, k, -8;\n" . + "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + (j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n") : + (j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n") + ), + ); + return; + + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_16x64.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_16x64.sass new file mode 100644 index 0000000..1dfb949 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nn_16x64.sass @@ -0,0 +1,1171 @@ +# Kernel: hgemm_nn_16x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(16*64 + 32)*2 + 64*64*2> + szShareA : (16*64 + 32) + szShareB : (64*64) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24|32|40|48|56>, tid16_8, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta + + 96-135 : load0A<0-7>, load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>, load4B<0-3>, load5B<0-3>, load6B<0-3>, load7B<0-3> + 136-153 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>, track4B<0-1>, track5B<0-1>, track6B<0-1>, track7B<0-1> + + 154-161 ~ swapBuf, readAs, readBs, writeAs, 
writeBs, k, ldb64 + 162-171 ~ tid, blkA, blkB, blkZ, writeCs, preds, tid16 + + 0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3> + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + 96-99 : loadC<0-3> + 100-103 : b<0-3> + 104-107 : c<0-3> + 108-109 : C<0-1> + 110-161 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb8, ldb, 3; +--:-:-:-:1 SHL ldb64, ldb, 7; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// tidAX = tid >> 3 +// tidAY = (tid & 7) << 3 +// shiftAX = (tid & 7) << 2 +01:-:-:-:1 SHR.U32 tidAX, tid, 3; +--:-:-:-:1 LOP.AND tidAY, tid, 7; +--:-:-:-:1 SHL shiftAX, tidAY, 2; +--:-:-:-:1 SHL tidAY, tidAY, 3; + +// tidBX = (tid & 15) << 2 +// tidBY = tid >> 4 +01:-:-:-:1 LOP.AND tidBX, tid, 15; +--:-:-:-:1 SHL tidBX, tidBX, 2; +01:-:-:-:1 SHR.U32 tidBY, tid, 4; + +--:-:-:-:1 IADD tidBY8, tidBY, 8; +--:-:-:-:1 IADD tidBY16, tidBY, 16; +--:-:-:-:1 IADD tidBY24, tidBY, 24; +--:-:-:-:1 IADD tidBY32, tidBY, 32; +--:-:-:-:1 IADD tidBY40, tidBY, 40; +--:-:-:-:1 IADD tidBY48, tidBY, 48; +--:-:-:-:1 IADD tidBY56, tidBY, 56; + +// trackA += ((blkA*16 + tidAX) * lda + tidAY) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 4; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa, param_m, PT; + +// 
trackB += (blkB*64 + tidBX + ldb*tidBY) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; + +--:-:-:-:1 LEA track0B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track1B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track2B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track3B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track4B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track4B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track5B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track5B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track6B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track6B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track7B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track7B1, tb, param_B[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P3, PT, txb, param_n, PT; +[+ + our $vec; + return $vec ? 
'' : q{ +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb3, param_n, PT; + }; ++] +--:-:-:-:1 P2R preds, PR, RZ, 0x7c; + +// writeAs = (tidAY*16 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 4; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (tidBY*64 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (tid & 1) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid >> 1) & 7) << 4 +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHL readBs, readBs, 4; + +// tid16 = tid & -16 +// tid16_8 = tid16 / 2 * 4 +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHL tid16_8, tid16, 1; + +// writeCs = (readAs + tid16*2) * 64 + readBs; +--:-:-:-:1 ISCADD writeCs, tid16, readAs, 1; +--:-:-:-:1 ISCADD writeCs, writeCs, readBs, 6; + +// Each block of 16 threads works on 8 lines, readAs is also shifted over by 4 +// readAs += tid16_8 * 16 + tid16 +// readBs += tid16_8 * 64 + 4x +--:-:-:-:1 ISCADD readAs, tid16_8, readAs, 4; +--:-:-:-:1 ISCADD readBs, tid16_8, readBs, 6; +--:-:-:-:1 IADD readAs, tid16, readAs; +--:-:-:-:1 IADD readBs, readBs, 4x; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// If k is not a multiple of 64 we want to grab the partial amount on the first fetch. +// If it is a multiple of 64 then make a full 64 line fetch. +--:-:-:-:1 LOP.AND.Z P0, partialK, k, 63; +--:-:-:-:1 @P0 MOV partialK, 64; +--:-:-:-:1 IADD k, k, -partialK; +[+ + our $vec; + return $vec ? 
q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY8, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY16, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY24, partialK, P3; + +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P1 LDG.E.CI.64 load0B, [track0B]; +--:-:3:-:1 @P4 LDG.E.CI.64 load1B, [track1B]; +--:-:4:-:1 @P5 LDG.E.CI.64 load2B, [track2B]; +--:-:4:-:1 @P6 LDG.E.CI.64 load3B, [track3B]; + + +--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:-:-:1 @!P1 LDS.U.64 load0B, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.64 load1B, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.64 load2B, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.64 load3B, [addr_zero]; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY32, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY40, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY48, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY56, partialK, P3; + +--:-:5:-:1 @P1 LDG.E.CI.64 load4B, [track4B]; +--:-:5:-:1 @P4 LDG.E.CI.64 load5B, [track5B]; +--:-:6:-:1 @P5 LDG.E.CI.64 load6B, [track6B]; +--:-:6:-:1 @P6 LDG.E.CI.64 load7B, [track7B]; + + +--:-:-:-:1 @!P1 LDS.U.64 load4B, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.64 load5B, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.64 load6B, [addr_zero]; +--:-:1:-:1 @!P6 LDS.U.64 load7B, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY1, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY2, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY3, partialK, P2; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:2:-:1 @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load0A0, RZ; +--:-:-:-:1 @!P4 MOV load0A1, 
RZ; +--:-:-:-:1 @!P5 MOV load0A2, RZ; +--:-:-:-:1 @!P6 MOV load0A3, RZ; + +--:-:-:-:1 IADD tidAY, tidAY, 4; +--:-:-:-:1 IADD tidAY1, tidAY1, 4; +--:-:-:-:1 IADD tidAY2, tidAY2, 4; +--:-:-:-:1 IADD tidAY3, tidAY3, 4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY1, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY2, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY3, partialK, P2; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @!P3 MOV load0A4, RZ; +--:-:-:-:1 @!P4 MOV load0A5, RZ; +--:-:-:-:1 @!P5 MOV load0A6, RZ; +--:-:-:-:1 @!P6 MOV load0A7, RZ; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load0B0, RZ; +--:-:-:-:1 @!P4 MOV load0B1, RZ; +--:-:-:-:1 @!P5 MOV load0B2, RZ; +--:-:-:-:1 @!P6 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY8, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x78; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load1B0, RZ; +--:-:-:-:1 @!P4 MOV load1B1, RZ; +--:-:-:-:1 @!P5 MOV load1B2, RZ; +--:-:-:-:1 @!P6 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY16, partialK, PT; +--:-:-:-:1 @P2 R2P PR, preds, 0x78; +--:-:-:-:1 @!P2 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 
@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load2B0, RZ; +--:-:-:-:1 @!P4 MOV load2B1, RZ; +--:-:-:-:1 @!P5 MOV load2B2, RZ; +--:-:-:-:1 @!P6 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY24, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load3B0, RZ; +--:-:-:-:1 @!P4 MOV load3B1, RZ; +--:-:-:-:1 @!P5 MOV load3B2, RZ; +--:-:-:-:1 @!P6 MOV load3B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY32, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x78; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load4B0, RZ; +--:-:-:-:1 @!P4 MOV load4B1, RZ; +--:-:-:-:1 @!P5 MOV load4B2, RZ; +--:-:-:-:1 @!P6 MOV load4B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY40, partialK, PT; +--:-:-:-:1 @P2 R2P PR, preds, 0x78; +--:-:-:-:1 @!P2 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load5B0, RZ; +--:-:-:-:1 @!P4 MOV load5B1, RZ; +--:-:-:-:1 @!P5 MOV load5B2, RZ; +--:-:-:-:1 @!P6 MOV load5B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY48, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>]; +--:-:-:-:1 @P4 
LDG.E.CI.U16 load6B1, [track6B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load6B0, RZ; +--:-:-:-:1 @!P4 MOV load6B1, RZ; +--:-:-:-:1 @!P5 MOV load6B2, RZ; +--:-:-:-:1 @!P6 MOV load6B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY56, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x78; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load7B0, RZ; +--:-:-:-:1 @!P4 MOV load7B1, RZ; +--:-:-:-:1 @!P5 MOV load7B2, RZ; +--:-:-:-:1 @!P6 MOV load7B3, RZ; + }; ++] +// partialB = partialK * ldb +--:-:-:-:1 XMAD.LO2 partialB, ldb, partialK, RZ; + +--:-:-:-:1 ISETP.GE.AND P1, PT, k, 64, PT; +--:-:-:-:1 IADD k, k, -64; +--:-:-:-:1 @P1 R2P PR, preds, 0x7c; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c; + + +[+ + our $vec; + return $vec ? 
q{ +03:-:-:-:1 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A2.H1; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0.H0; + } : q{ +02:-:-:-:1 F2F.F32.F16 load0A7, load0A7; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A6; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A5; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A4; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A3; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0; + }; ++] +--:-:-:-:0 LEA track0A0.CC, partialK, track0A0, 1; +01:-:-:-:1 STS [writeAs + 4x<7*16>], load0A7; +--:-:-:-:1 STS [writeAs + 4x<6*16>], load0A6; +--:-:-:-:1 STS [writeAs + 4x<5*16>], load0A5; +--:-:-:-:1 STS [writeAs + 4x<4*16>], load0A4; +02:-:-:-:1 STS [writeAs + 4x<3*16>], load0A3; +--:-:-:-:1 STS [writeAs + 4x<2*16>], load0A2; +--:-:-:-:1 STS [writeAs + 4x<1*16>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<0*16>], load0A0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +04:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:1:-:1 F2F.F32.F16 load0B0, load0B0.H0; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + } : q{ +04:-:-:-:1 F2F.F32.F16 load0B0, load0B0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:1:-:1 F2F.F32.F16 load0B3, load0B3; +--:-:-:-:1 F2F.F32.F16 load1B0, load1B0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:2:-:1 F2F.F32.F16 load1B3, load1B3; + }; ++] +--:-:-:-:0 LEA track0B0.CC, partialB, track0B0, 1; +01:-:-:-:6 STS.128 [writeBs + 4x<0*64>], load0B; +--:-:-:-:1 IADD.X track0B1, track0B1, RZ; + +--:-:-:-:0 LEA track1B0.CC, partialB, track1B0, 1; +02:-:-:-:6 STS.128 [writeBs + 4x<8*64>], load1B; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +08:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:1:-:1 F2F.F32.F16 load2B0, load2B0.H0; +--:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:2:-:1 F2F.F32.F16 load3B0, load3B0.H0; + } : q{ +08:-:-:-:1 F2F.F32.F16 load2B0, load2B0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:1:-:1 F2F.F32.F16 load2B3, load2B3; +--:-:-:-:1 F2F.F32.F16 load3B0, load3B0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:2:-:1 F2F.F32.F16 load3B3, load3B3; + }; ++] +--:-:-:-:0 LEA track2B0.CC, partialB, track2B0, 1; +01:-:-:-:6 STS.128 [writeBs + 4x<16*64>], load2B; +--:-:-:-:1 IADD.X track2B1, track2B1, RZ; + +--:-:-:-:0 LEA track3B0.CC, partialB, track3B0, 1; +02:-:-:-:6 STS.128 [writeBs + 4x<24*64>], load3B; +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +10:-:-:-:1 F2F.F32.F16 load4B3, load4B1.H1; +--:-:-:-:1 F2F.F32.F16 load4B2, load4B1.H0; +--:-:-:-:1 F2F.F32.F16 load4B1, load4B0.H1; +--:-:1:-:1 F2F.F32.F16 load4B0, load4B0.H0; +--:-:-:-:1 F2F.F32.F16 load5B3, load5B1.H1; +--:-:-:-:1 F2F.F32.F16 load5B2, load5B1.H0; +--:-:-:-:1 F2F.F32.F16 load5B1, load5B0.H1; +--:-:2:-:1 F2F.F32.F16 load5B0, load5B0.H0; + } : q{ +10:-:-:-:1 F2F.F32.F16 load4B0, load4B0; +--:-:-:-:1 F2F.F32.F16 load4B1, load4B1; +--:-:-:-:1 F2F.F32.F16 load4B2, load4B2; +--:-:1:-:1 F2F.F32.F16 load4B3, load4B3; +--:-:-:-:1 F2F.F32.F16 load5B0, load5B0; +--:-:-:-:1 F2F.F32.F16 load5B1, load5B1; +--:-:-:-:1 F2F.F32.F16 load5B2, load5B2; +--:-:2:-:1 F2F.F32.F16 load5B3, load5B3; + }; ++] +--:-:-:-:0 LEA track4B0.CC, partialB, track4B0, 1; +01:-:-:-:6 STS.128 [writeBs + 4x<32*64>], load4B; +--:-:-:-:1 IADD.X track4B1, track4B1, RZ; + +--:-:-:-:0 LEA track5B0.CC, partialB, track5B0, 1; +02:-:-:-:6 STS.128 [writeBs + 4x<40*64>], load5B; +--:-:-:-:0 IADD.X track5B1, track5B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +20:-:-:-:1 F2F.F32.F16 load6B3, load6B1.H1; +--:-:-:-:1 F2F.F32.F16 load6B2, load6B1.H0; +--:-:-:-:1 F2F.F32.F16 load6B1, load6B0.H1; +--:-:1:-:1 F2F.F32.F16 load6B0, load6B0.H0; +--:-:-:-:1 F2F.F32.F16 load7B3, load7B1.H1; +--:-:-:-:1 F2F.F32.F16 load7B2, load7B1.H0; +--:-:-:-:1 F2F.F32.F16 load7B1, load7B0.H1; +--:-:2:-:1 F2F.F32.F16 load7B0, load7B0.H0; + } : q{ +20:-:-:-:1 F2F.F32.F16 load6B0, load6B0; +--:-:-:-:1 F2F.F32.F16 load6B1, load6B1; +--:-:-:-:1 F2F.F32.F16 load6B2, load6B2; +--:-:1:-:1 F2F.F32.F16 load6B3, load6B3; +--:-:-:-:1 F2F.F32.F16 load7B0, load7B0; +--:-:-:-:1 F2F.F32.F16 load7B1, load7B1; +--:-:-:-:1 F2F.F32.F16 load7B2, load7B2; +--:-:2:-:1 F2F.F32.F16 load7B3, load7B3; + }; ++] +--:-:-:-:0 LEA track6B0.CC, partialB, track6B0, 1; +01:-:-:-:6 STS.128 [writeBs + 4x<48*64>], load6B; +--:-:-:-:1 IADD.X track6B1, track6B1, RZ; + +--:-:-:-:0 LEA track7B0.CC, partialB, track7B0, 1; +02:-:-:-:6 STS.128 [writeBs + 4x<56*64>], load7B; +--:-:-:-:0 IADD.X track7B1, track7B1, RZ; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*16 + 00>]; +--:-:-:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*16 + 08>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +[+ + our $vec; + return $vec ? 
q{ +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.64 load0B, [track0B]; +--:-:3:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load2B, [track2B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load3B, [track3B]; +--:-:5:-:1 @P3 LDG.E.CI.64 load4B, [track4B]; +--:-:5:-:1 @P3 LDG.E.CI.64 load5B, [track5B]; +--:-:6:-:1 @P3 LDG.E.CI.64 load6B, [track6B]; +--:-:6:-:1 @P3 LDG.E.CI.64 load7B, [track7B]; + } : q{ +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>]; +--:-:5:-:1 
@P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>]; + }; ++] + +LOOP: + +[+ + our $vec; + our %insert = + ( + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, RZ, PT;\n", + j0c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, k, 64, PT;\n" . + "--:-:-:-:1 IADD k, k, -64;\n", + + j0c23 => "--:-:-:-:1 \@P1 R2P PR, preds, 0x7c;\n", + j0c24 => "--:-:-:-:1 \@!P1 R2P PR, RZ, 0x7c;\n", + + j2c32 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<64>;\n", + j2c37 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + + j3c25 => "--:-:-:-:1 \@P3 IADD track0B0.CC, track0B0, ldb64;\n", + j3c30 => "--:-:-:-:1 \@P3 IADD.X track0B1, track0B1, RZ;\n", + j3c32 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, ldb64;\n", + j3c37 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + + j4c25 => "--:-:-:-:1 \@P3 IADD track2B0.CC, track2B0, ldb64;\n", + j4c30 => "--:-:-:-:1 \@P3 IADD.X track2B1, track2B1, RZ;\n", + j4c32 => "--:-:-:-:1 \@P3 IADD track3B0.CC, track3B0, ldb64;\n", + j4c37 => "--:-:-:-:1 \@P3 IADD.X track3B1, track3B1, RZ;\n", + + j5c25 => "--:-:-:-:1 \@P3 IADD track4B0.CC, track4B0, ldb64;\n", + j5c30 => "--:-:-:-:1 \@P3 IADD.X track4B1, track4B1, RZ;\n", + j5c32 => "--:-:-:-:1 \@P3 IADD track5B0.CC, track5B0, ldb64;\n", + j5c37 => "--:-:-:-:1 \@P3 IADD.X track5B1, track5B1, RZ;\n", + + j6c25 => "--:-:-:-:1 \@P3 
IADD track6B0.CC, track6B0, ldb64;\n", + j6c30 => "--:-:-:-:1 \@P3 IADD.X track6B1, track6B1, RZ;\n", + j6c32 => "--:-:-:-:1 \@P3 IADD track7B0.CC, track7B0, ldb64;\n", + j6c37 => "--:-:-:-:1 \@P3 IADD.X track7B1, track7B1, RZ;\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j2c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<7*16>], load0A7;\n", + j2c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*16>], load0A6;\n", + j2c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*16>], load0A5;\n", + j2c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*16>], load0A4;\n", + j2c24 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*16>], load0A3;\n", + j2c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*16>], load0A2;\n", + j2c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*16>], load0A1;\n", + j2c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<0*16>], load0A0;\n", + + j3c16 => "04:-:-:-:1 \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n", + j3c20 => "--:3:-:-:1 \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n", + + j4c16 => "08:-:-:-:1 \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n", + j4c20 => "--:4:-:-:1 \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n", + + j5c16 => "10:-:-:-:1 \@P0 STS.128 [writeBs + 4x<32*64>], load4B;\n", + j5c20 => "--:5:-:-:1 \@P0 STS.128 [writeBs + 4x<40*64>], load5B;\n", + + j6c16 => "20:-:-:-:1 \@P0 STS.128 [writeBs + 4x<48*64>], load6B;\n", + j6c20 => "--:6:-:-:1 \@P0 STS.128 [writeBs + 4x<56*64>], load7B;\n", + + ($vec ? 
+ ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j2c36 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n", + j2c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j3c36 => "08:-:-:-:1 \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n", + j3c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n", + j3c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n", + j3c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n", + j3c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j4c36 => "10:-:-:-:1 \@P0 F2F.F32.F16 load4B3, load4B1.H1;\n", + j4c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B2, load4B1.H0;\n", + j4c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B1, load4B0.H1;\n", + j4c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B0, load4B0.H0;\n", + j4c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B3, load5B1.H1;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B2, load5B1.H0;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 
load5B1, load5B0.H1;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load5B0, load5B0.H0;\n", + + j5c36 => "20:-:-:-:1 \@P0 F2F.F32.F16 load6B3, load6B1.H1;\n", + j5c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B2, load6B1.H0;\n", + j5c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B1, load6B0.H1;\n", + j5c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B0, load6B0.H0;\n", + j5c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B3, load7B1.H1;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B2, load7B1.H0;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B1, load7B0.H1;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load7B0, load7B0.H0;\n", + + j2c61 => "02:-:2:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j3c60 => "04:-:-:-:1 \@P3 LDG.E.CI.64 load0B, [track0B];\n", + j3c62 => "--:-:3:-:1 \@P3 LDG.E.CI.64 load1B, [track1B];\n", + j4c60 => "08:-:-:-:1 \@P3 LDG.E.CI.64 load2B, [track2B];\n", + j4c62 => "--:-:4:-:1 \@P3 LDG.E.CI.64 load3B, [track3B];\n", + j5c60 => "10:-:-:-:1 \@P3 LDG.E.CI.64 load4B, [track4B];\n", + j5c62 => "--:-:5:-:1 \@P3 LDG.E.CI.64 load5B, [track5B];\n", + j6c60 => "20:-:-:-:1 \@P3 LDG.E.CI.64 load6B, [track6B];\n", + j6c62 => "--:-:6:-:1 \@P3 LDG.E.CI.64 load7B, [track7B];\n", + ) : + ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A0, load0A0;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A1;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A2;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A3;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A4;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A5;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A6;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A7, load0A7;\n", # write barrier 2, as in the vec branch: the STS at j2c16 waits on 02 before reading load0A7 + + j2c36 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0;\n", + j2c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B1;\n", + j2c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B2;\n", + j2c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B3;\n", + j2c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+ j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B1;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B2;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load1B3, load1B3;\n", + + j3c36 => "08:-:-:-:1 \@P0 F2F.F32.F16 load2B0, load2B0;\n", + j3c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B1;\n", + j3c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B2;\n", + j3c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B3, load2B3;\n", + j3c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B0, load3B0;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B1;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B2;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load3B3, load3B3;\n", + + j4c36 => "10:-:-:-:1 \@P0 F2F.F32.F16 load4B0, load4B0;\n", + j4c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B1, load4B1;\n", + j4c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B2, load4B2;\n", + j4c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B3, load4B3;\n", + j4c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B0, load5B0;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B1, load5B1;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B2, load5B2;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load5B3, load5B3;\n", + + j5c36 => "20:-:-:-:1 \@P0 F2F.F32.F16 load6B0, load6B0;\n", + j5c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B1, load6B1;\n", + j5c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B2, load6B2;\n", + j5c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B3, load6B3;\n", + j5c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B0, load7B0;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B1, load7B1;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B2, load7B2;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load7B3, load7B3;\n", + + j2c48 => "02:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j2c50 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j2c52 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j2c54 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + j2c56 => "--:-:-:-:1 
\@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n", + j2c58 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n", + j2c60 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n", + j2c62 => "--:-:2:-:1 \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n", + + j3c48 => "04:-:-:-:1 \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j3c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j3c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j3c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + j3c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j3c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j3c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j3c62 => "--:-:3:-:1 \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + + j4c48 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j4c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j4c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j4c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + j4c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j4c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j4c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j4c62 => "--:-:4:-:1 \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + + j5c48 => "10:-:-:-:1 \@P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>];\n", + j5c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>];\n", + j5c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>];\n", + j5c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>];\n", + j5c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>];\n", + j5c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>];\n", + j5c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>];\n", + j5c62 => "--:-:5:-:1 \@P6 LDG.E.CI.U16 
load5B3, [track5B + 2x<3>];\n", + + j6c48 => "20:-:-:-:1 \@P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>];\n", + j6c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>];\n", + j6c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>];\n", + j6c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>];\n", + j6c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>];\n", + j6c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>];\n", + j6c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>];\n", + j6c62 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>];\n", + ) + ), + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out = ''; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*16 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*16 + 08>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHR.U32 tid16, tid, 4; +--:-:-:-:1 SHL tid15, tid15, 2; +--:-:-:-:1 ISCADD readCs, tid16, tid15, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*64 + tid15; +--:-:-:-:1 ISCADD cx, blkB, tid15, 6; +--:-:-:-:1 IADD cx1, cx, 1; +--:-:-:-:1 IADD cx2, cx, 2; +--:-:-:-:1 IADD cx3, cx, 3; + +// cy = blkA*16 + tid16 +--:-:-:-:1 ISCADD cy, blkA, tid16, 4; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 SHL ldc8, ldc, 4; + +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C1, ci, param_C[1], RZ, 1; + +// P0 = cx < n +--:-:-:-:1 ISETP.LT.AND P0, PT, cx, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, cx1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, cx2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, cx3, param_n, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +// P4 = cy < m +--:-:-:-:1 ISETP.LT.AND P4, PT, cy, param_m, PT; + +// P5 = beta != 0 && P4 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P4; + +// P6 = Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; + +// Init beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, cx7y4, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:5 EXIT; + +STORE_C: + +[+ + our $vec; + return $vec ? 
q{ +--:-:1:-:1 @P0 LDG.E.64 loadC, [C]; + } : q{ +--:-:-:-:0 @!P0 MOV loadC0, RZ; +--:-:-:-:1 @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>]; +--:-:-:-:0 @!P1 MOV loadC1, RZ; +--:-:-:-:1 @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>]; +--:-:-:-:0 @!P2 MOV loadC2, RZ; +--:-:-:-:1 @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>]; +--:-:-:-:0 @!P3 MOV loadC3, RZ; +--:-:1:-:1 @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>]; + }; ++] + +// Restore output preds +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 LDS.U.128 part0C, [readCs + 4x<0*8*64>]; +--:-:2:-:1 LDS.U.128 part1C, [readCs + 4x<1*8*64>]; +--:-:-:-:1 LDS.U.128 part2C, [readCs + 4x<2*8*64>]; +--:-:3:-:1 LDS.U.128 part3C, [readCs + 4x<3*8*64>]; +--:-:-:-:1 LDS.U.128 part4C, [readCs + 4x<4*8*64>]; +--:-:4:-:1 LDS.U.128 part5C, [readCs + 4x<5*8*64>]; +--:-:-:-:1 LDS.U.128 part6C, [readCs + 4x<6*8*64>]; +--:-:5:-:1 LDS.U.128 part7C, [readCs + 4x<7*8*64>]; + + +02:-:-:-:1 @P0 FADD part0C0, part0C0, part1C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part1C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part1C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part1C3; + +04:-:-:-:1 @P0 FADD part2C0, part2C0, part3C0; +--:-:-:-:1 @P1 FADD part2C1, part2C1, part3C1; +--:-:-:-:1 @P2 FADD part2C2, part2C2, part3C2; +--:-:-:-:1 @P3 FADD part2C3, part2C3, part3C3; + +08:-:-:-:1 @P0 FADD part4C0, part4C0, part5C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part5C1; +--:-:-:-:1 @P2 FADD part4C2, part4C2, part5C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part5C3; + +10:-:-:-:1 @P0 FADD part6C0, part6C0, part7C0; +--:-:-:-:1 @P1 FADD part6C1, part6C1, part7C1; +--:-:-:-:1 @P2 FADD part6C2, part6C2, part7C2; +--:-:-:-:1 @P3 FADD part6C3, part6C3, part7C3; + +--:-:-:-:1 @P0 FADD part0C0, part0C0, part2C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part2C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part2C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part2C3; + +--:-:-:-:1 @P0 FADD part4C0, part4C0, part6C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part6C1; +--:-:-:-:1 
@P2 FADD part4C2, part4C2, part6C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part6C3; + +--:-:-:-:1 @P0 FADD c0, part0C0, part4C0; +--:-:-:-:1 @P1 FADD c1, part0C1, part4C1; +--:-:-:-:1 @P2 FADD c2, part0C2, part4C2; +--:-:-:-:1 @P3 FADD c3, part0C3, part4C3; + + +--:-:-:-:0 IADD cy, cy, 8; + +[+ + our $vec; + return $vec ? q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0.H0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC0.H1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC1.H0; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC1.H1; + } : q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC2; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC3; + }; ++] + +01:-:-:-:1 @P5 FFMA c0, b0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, b1, beta, c1; +04:-:-:-:1 @P5 FFMA c2, b2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, b3, beta, c3; + +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:0 ISETP.LT.AND P5, PT, cy, param_m, P5; + +--:-:1:-:1 @P0 F2F.F16.F32 c0, c0; +--:-:2:-:1 @P1 F2F.F16.F32 c1, c1; + +--:-:-:-:0 ISETP.LT.AND P4, PT, cy, param_m, PT; + +--:-:3:-:1 @P2 F2F.F16.F32 c2, c2; +--:-:4:-:1 @P3 F2F.F16.F32 c3, c3; + +[+ + our $vec; + return $vec ? 
q{ +03:-:-:-:2 @P0 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 @P0 BFI c1, c3, 0x1010, c2; + +--:1:-:-:1 @P0 STG.E.CG.64 [C], c; + } : q{ +01:-:-:-:1 @P0 STG.E.U16 [C + 2x<0>], c0; +02:-:-:-:1 @P1 STG.E.U16 [C + 2x<1>], c1; +04:-:-:-:1 @P2 STG.E.U16 [C + 2x<2>], c2; +08:1:-:-:1 @P3 STG.E.U16 [C + 2x<3>], c3; + }; ++] + +// Restore beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +01:-:-:-:6 IADD C0.CC, C0, ldc8; +--:-:-:-:0 IADD.X C1, C1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_32x128.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_32x128.sass new file mode 100644 index 0000000..8c4510d --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nn_32x128.sass @@ -0,0 +1,562 @@ +# Kernel: hgemm_nn_32x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*16*2 + (32*16 + 32)*2> + szShareA : (32*16 + 32) + szShareB : (128*16) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadA<0-3> + 84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + + 100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; +--:-:3:-:1 S2R blkA, SR_CTAID.Y; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, 
param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb4, ldb, 2; +--:-:-:-:1 SHL ldb16, ldb, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) +01:-:-:-:1 LOP.AND tidBX, tid, 31; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 5; + +// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 2 +04:-:-:-:1 ISCADD txa, blkA, tidAX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 1; + +// trackB += (blkB*128 + tidBX + ldb*tidBY) * 2 +02:-:-:-:1 ISCADD txb, blkB, tidBX, 7; +--:-:-:-:1 XMAD.LO2 tb0, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb0, ldbz, blkZ, tb0; +--:-:-:-:1 IADD tb1, tb0, ldb4; +--:-:-:-:1 IADD tb2, tb1, ldb4; +--:-:-:-:1 IADD tb3, tb2, ldb4; + +--:-:-:-:1 LEA track0B0.CC, tb0, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb0, param_B[1], RZ, 1; +--:-:-:-:1 LEA track1B0.CC, tb1, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb1, param_B[1], RZ, 1; +--:-:-:-:1 LEA track2B0.CC, tb2, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb2, param_B[1], RZ, 1; +--:-:-:-:1 LEA track3B0.CC, tb3, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb3, param_B[1], RZ, 1; + +// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*128 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 7; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1;
+--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4 +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readBs, readBs, tid96; +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + +--:-:-:-:1 IADD tidBY1, tidBY, 4; +--:-:-:-:1 IADD tidBY2, tidBY, 8; +--:-:-:-:1 IADD tidBY3, tidBY, 12; + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txb, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidBY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.CI.64 load0B, [track0B]; +--:-:2:-:1 @P1 LDG.E.CI.64 load1B, [track1B]; +--:-:3:-:1 @P2 LDG.E.CI.64 load2B, [track2B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load3B, [track3B]; +--:-:5:-:1 @P4 LDG.E.CI.64 loadA, [trackA]; + + + +--:-:6:-:1 @!P0 LDS.U.64 load0B, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.64 load1B, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.64 load2B, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 load3B, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.64 loadA, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:1:-:1 @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:1:-:1 @P1 
LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:1:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:1:-:1 @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY1, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P5; + +--:-:2:-:1 @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:3:-:1 @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:4:-:1 @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:4:-:1 @P3 
LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P6; + +--:-:5:-:1 @P0 LDG.E.CI.U16 loadA0, [trackA + 2x<0>]; +--:-:5:-:1 @P1 LDG.E.CI.U16 loadA1, [trackA + 2x<1>]; +--:-:5:-:1 @P2 LDG.E.CI.U16 loadA2, [trackA + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 loadA3, [trackA + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P0, RZ, k, 15; +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 16, P0; + + + + + our $vec; + return $vec ? 
q{ +21:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:1:-:1 F2F.F32.F16 load0B0, load0B0.H0; + +02:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + +04:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:3:-:1 F2F.F32.F16 load2B0, load2B0.H0; + +08:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:4:-:1 F2F.F32.F16 load3B0, load3B0.H0; + +10:-:-:-:1 F2F.F32.F16 loadA3, loadA1.H1; +--:-:-:-:1 F2F.F32.F16 loadA2, loadA1.H0; +--:-:-:-:1 F2F.F32.F16 loadA1, loadA0.H1; +--:-:5:-:1 F2F.F32.F16 loadA0, loadA0.H0; + } : q{ +21:-:-:-:1 F2F.F32.F16 load0B0, load0B0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:1:-:1 F2F.F32.F16 load0B3, load0B3; + +02:-:-:-:1 F2F.F32.F16 load1B0, load1B0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:2:-:1 F2F.F32.F16 load1B3, load1B3; + +04:-:-:-:1 F2F.F32.F16 load2B0, load2B0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:3:-:1 F2F.F32.F16 load2B3, load2B3; + +08:-:-:-:1 F2F.F32.F16 load3B0, load3B0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:4:-:1 F2F.F32.F16 load3B3, load3B3; + +10:-:-:-:1 F2F.F32.F16 loadA0, loadA0; +--:-:-:-:1 F2F.F32.F16 loadA1, loadA1; +--:-:-:-:1 F2F.F32.F16 loadA2, loadA2; +--:-:5:-:1 F2F.F32.F16 loadA3, loadA3; + }; + + +01:-:-:-:1 STS.128 [writeBs + 4x<0*128>], load0B; +--:-:-:-:6 IADD track0B0.CC, track0B0, ldb16; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +02:-:-:-:1 STS.128 [writeBs + 4x<4*128>], load1B; +--:-:-:-:6 IADD track1B0.CC, track1B0, 
ldb16; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +04:-:-:-:1 STS.128 [writeBs + 4x<8*128>], load2B; +--:-:-:-:6 IADD track2B0.CC, track2B0, ldb16; +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +08:-:-:-:1 STS.128 [writeBs + 4x<12*128>], load3B; +--:-:-:-:6 IADD track3B0.CC, track3B0, ldb16; +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +10:-:-:-:1 STS [writeAs + 4x<0*32>], loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*32>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*32>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*32>], loadA3; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.CI.64 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.64 load2B, [track2B]; +--:-:5:-:1 @P5 LDG.E.CI.64 load3B, [track3B]; +--:-:6:-:1 @P6 LDG.E.CI.64 loadA, [trackA]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; 
+--:-:5:-:1 @P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n", + j9c6 => "10:5:-:-:1 \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n", + j11c6 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<3*32>], loadA3;\n", + j11c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], loadA2;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], loadA1;\n", + j11c12 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<0*32>], loadA0;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0B0.CC, track0B0, ldb16;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0B1, track0B1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, ldb16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2B0.CC, track2B0, ldb16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2B1, track2B1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3B0.CC, track3B0, ldb16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3B1, track3B1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackA0.CC, trackA0, 2x<16>;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackA1, trackA1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . 
+ "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.64 load0B, [track0B];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1B, [track1B];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.64 load2B, [track2B];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.64 load3B, [track3B];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.64 loadA, [trackA];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1B3, load1B1.H1;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B2, load1B1.H0;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B1, load1B0.H1;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j6c13 => "10:-:-:-:1 \@P5 F2F.F32.F16 load2B3, load2B1.H1;\n", + j6c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2B2, load2B1.H0;\n", + j6c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2B1, load2B0.H1;\n", + j6c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load2B0, load2B0.H0;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B3, load3B1.H1;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B2, load3B1.H0;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B1, load3B0.H1;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadA3, loadA1.H1;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA2, loadA1.H0;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA1, loadA0.H1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA0, loadA0.H0;\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j3c31 
=> "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0B0, load0B0;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B1, load0B1;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B2, load0B2;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0B3, load0B3;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1B0, load1B0;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B1, load1B1;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B2, load1B2;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1B3, load1B3;\n", + + j6c13 => "10:-:-:-:1 \@P5 F2F.F32.F16 load2B0, load2B0;\n", + j6c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2B1, 
load2B1;\n", + j6c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2B2, load2B2;\n", + j6c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load2B3, load2B3;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B0, load3B0;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B1, load3B1;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B2, load3B2;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3B3, load3B3;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadA3, loadA3;\n", + j10c17 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA2, loadA2;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA1, loadA1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA0, loadA0;\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_32x64.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_32x64.sass new file mode 100644 index 0000000..56b813f --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nn_32x64.sass @@ -0,0 +1,913 @@ +# Kernel: hgemm_nn_32x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<32*33*2 + 64*32*2> + szShareA : (32*33) + szShareB : (64*32) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24>, tid1, tid32, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta, xmad_tb + + 96-119 : load0A<0-7>, load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + 120-129 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 130-137 ~ swapBuf, readAs, readBs, writeAs, writeBs, k, ldb32 + 138-144 ~ tid, blkA, blkB, blkZ, writeCs, preds + + 0-15 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3> + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + 96-99 : loadC<0-3> + 100-103 : b<0-3> + 104-107 : c<0-3> + 108-109 : C<0-1> + 110-137 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15, tid16 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, 
SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb8, ldb, 3; +--:-:-:-:1 SHL ldb32, ldb, 6; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 3 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +--:-:-:-:1 LOP.AND tidAY, tid, 3; +--:-:-:-:1 SHL shiftAX, tidAY, 3; +--:-:-:-:1 SHL tidAY, tidAY, 3; + +// tidBX = (tid & 15) << 2 +// tidBY = tid >> 4 +01:-:-:-:1 LOP.AND tidBX, tid, 15; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 4; + +--:-:-:-:1 IADD tidBY8, tidBY, 8; +--:-:-:-:1 IADD tidBY16, tidBY, 16; +--:-:-:-:1 IADD tidBY24, tidBY, 24; + +// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa, param_m, PT; + +// trackB += (blkB*64 + tidBX + ldb*tidBY) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA track0B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track1B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track2B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track3B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb, param_B[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P3, PT, txb, param_n, PT; +[+ + our 
$vec; + return $vec ? '' : q{ +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb3, param_n, PT; + }; ++] +--:-:-:-:1 P2R preds, PR, RZ, 0x7c; + +// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (tidBY*64 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid >> 1) & 7) << 4 +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHL readBs, readBs, 4; + +// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5) +// tid32 = tid & -32 +--:-:-:-:1 LOP.AND tid32, tid, -32; + +// Write out the 4 groups of 32 rows 16 at a time +// writeCs = (readAs + tid32/2*4) * 64 + readBs +--:-:-:-:1 ISCADD writeCs, tid32, readAs, 1; +--:-:-:-:1 ISCADD writeCs, writeCs, readBs, 6; + +// Each block of 32 threads works on 8 lines, +// readAs is also shifted over by 8 for each group of 32 threads +// readAs += tid32/4 * 32 * 4 + tid32/4 * 4 +// readBs += tid32/4 * 64 * 4 + 4x +--:-:-:-:1 ISCADD readAs, tid32, readAs, 5; +--:-:-:-:1 ISCADD readBs, tid32, readBs, 6; +--:-:-:-:1 IADD readAs, tid32, readAs; +--:-:-:-:1 IADD readBs, readBs, 4x; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// If k is not a multiple of 32 we want to grab the partial amount on the first fetch. +// If it is a multiple of 32 then make a full 32 line fetch. 
+--:-:-:-:1 LOP.AND.Z P0, partialK, k, 31; +--:-:-:-:1 @P0 MOV partialK, 32; +--:-:-:-:1 IADD k, k, -partialK; +[+ + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY8, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY16, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY24, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidBY, partialK, P3; + +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.64 load0B, [track0B]; +--:-:4:-:1 @P4 LDG.E.CI.64 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.64 load2B, [track2B]; +--:-:6:-:1 @P6 LDG.E.CI.64 load3B, [track3B]; + + +--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.64 load0B, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.64 load1B, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.64 load2B, [addr_zero]; +--:-:1:-:1 @!P6 LDS.U.64 load3B, [addr_zero]; + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY1, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY2, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY3, partialK, P2; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:2:-:1 @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load0A0, RZ; +--:-:-:-:1 @!P4 MOV load0A1, RZ; +--:-:-:-:1 @!P5 MOV load0A2, RZ; +--:-:-:-:1 @!P6 MOV load0A3, RZ; + +--:-:-:-:1 IADD tidAY, tidAY, 4; +--:-:-:-:1 IADD tidAY1, tidAY1, 4; +--:-:-:-:1 IADD tidAY2, tidAY2, 4; +--:-:-:-:1 IADD tidAY3, tidAY3, 4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY1, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY2, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY3, partialK, P2; + +--:-:-:-:1 @P3 LDG.E.CI.U16 
load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @!P3 MOV load0A4, RZ; +--:-:-:-:1 @!P4 MOV load0A5, RZ; +--:-:-:-:1 @!P5 MOV load0A6, RZ; +--:-:-:-:1 @!P6 MOV load0A7, RZ; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load0B0, RZ; +--:-:-:-:1 @!P4 MOV load0B1, RZ; +--:-:-:-:1 @!P5 MOV load0B2, RZ; +--:-:-:-:1 @!P6 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY8, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x78; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load1B0, RZ; +--:-:-:-:1 @!P4 MOV load1B1, RZ; +--:-:-:-:1 @!P5 MOV load1B2, RZ; +--:-:-:-:1 @!P6 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY16, partialK, PT; +--:-:-:-:1 @P2 R2P PR, preds, 0x78; +--:-:-:-:1 @!P2 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load2B0, RZ; +--:-:-:-:1 @!P4 MOV load2B1, RZ; +--:-:-:-:1 @!P5 MOV load2B2, RZ; +--:-:-:-:1 @!P6 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY24, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load3B0, 
[track3B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load3B0, RZ; +--:-:-:-:1 @!P4 MOV load3B1, RZ; +--:-:-:-:1 @!P5 MOV load3B2, RZ; +--:-:-:-:1 @!P6 MOV load3B3, RZ; + + }; ++] +// partialB = partialK * ldb +--:-:-:-:1 XMAD.LO2 partialB, ldb, partialK, RZ; + +--:-:-:-:1 ISETP.GE.AND P1, PT, k, 32, PT; +--:-:-:-:1 IADD k, k, -32; +--:-:-:-:1 @P1 R2P PR, preds, 0x7c; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c; + + +[+ + our $vec; + return $vec ? q{ +03:-:-:-:1 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A2.H1; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0.H0; + } : q{ +02:-:-:-:1 F2F.F32.F16 load0A7, load0A7; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A6; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A5; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A4; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A3; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0; + }; ++] +--:-:-:-:0 LEA track0A0.CC, partialK, track0A0, 1; +01:-:-:-:1 STS [writeAs + 4x<7*32>], load0A7; +--:-:-:-:1 STS [writeAs + 4x<6*32>], load0A6; +--:-:-:-:1 STS [writeAs + 4x<5*32>], load0A5; +--:-:-:-:1 STS [writeAs + 4x<4*32>], load0A4; +02:-:-:-:1 STS [writeAs + 4x<3*32>], load0A3; +--:-:-:-:1 STS [writeAs + 4x<2*32>], load0A2; +--:-:-:-:1 STS [writeAs + 4x<1*32>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<0*32>], load0A0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +04:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:3:-:1 F2F.F32.F16 load0B0, load0B0.H0; + +08:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:4:-:1 F2F.F32.F16 load1B0, load1B0.H0; + +10:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:5:-:1 F2F.F32.F16 load2B0, load2B0.H0; + +20:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:6:-:1 F2F.F32.F16 load3B0, load3B0.H0; + } : q{ +04:-:-:-:1 F2F.F32.F16 load0B0, load0B0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:3:-:1 F2F.F32.F16 load0B3, load0B3; + +08:-:-:-:1 F2F.F32.F16 load1B0, load1B0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:4:-:1 F2F.F32.F16 load1B3, load1B3; + +10:-:-:-:1 F2F.F32.F16 load2B0, load2B0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:5:-:1 F2F.F32.F16 load2B3, load2B3; + +20:-:-:-:1 F2F.F32.F16 load3B0, load3B0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:6:-:1 F2F.F32.F16 load3B3, load3B3; + }; ++] + +--:-:-:-:0 LEA track0B0.CC, partialB, track0B0, 1; +04:-:-:-:6 STS.128 [writeBs + 4x<0*64>], load0B; +--:-:-:-:1 IADD.X track0B1, track0B1, RZ; + +--:-:-:-:0 LEA track1B0.CC, partialB, track1B0, 1; +08:-:-:-:6 STS.128 [writeBs + 4x<8*64>], load1B; +--:-:-:-:1 IADD.X track1B1, track1B1, RZ; + +--:-:-:-:0 LEA track2B0.CC, partialB, track2B0, 1; +10:-:-:-:6 STS.128 [writeBs + 4x<16*64>], load2B; +--:-:-:-:1 IADD.X track2B1, track2B1, RZ; + +--:-:-:-:0 LEA track3B0.CC, partialB, track3B0, 1; +20:-:-:-:6 STS.128 [writeBs + 4x<24*64>], load3B; +--:-:-:-:0 IADD.X 
track3B1, track3B1, RZ; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +[+ + our $vec; + return $vec ? q{ +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.64 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:5:-:1 @P3 LDG.E.CI.64 load2B, [track2B]; +--:-:6:-:1 @P3 LDG.E.CI.64 load3B, [track3B]; + } : q{ +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 
2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + }; ++] + +LOOP: + +[+ + our $vec; + our %insert = + ( + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, RZ, PT;\n", + j0c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, k, 32, PT;\n" . + "--:-:-:-:1 IADD k, k, -32;\n", + + j0c23 => "--:-:-:-:1 \@P1 R2P PR, preds, 0x7c;\n", + j0c24 => "--:-:-:-:1 \@!P1 R2P PR, RZ, 0x7c;\n", + + j2c32 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<32>;\n", + j2c37 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j3c32 => "--:-:-:-:1 \@P3 IADD track0B0.CC, track0B0, ldb32;\n", + j3c37 => "--:-:-:-:1 \@P3 IADD.X track0B1, track0B1, RZ;\n", + j4c32 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, ldb32;\n", + j4c37 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j5c32 => "--:-:-:-:1 \@P3 IADD track2B0.CC, track2B0, ldb32;\n", + j5c37 => "--:-:-:-:1 \@P3 IADD.X track2B1, track2B1, RZ;\n", + j6c32 => "--:-:-:-:1 \@P3 IADD track3B0.CC, track3B0, ldb32;\n", + j6c37 => "--:-:-:-:1 \@P3 IADD.X track3B1, track3B1, RZ;\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j2c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<7*32>], load0A7;\n", + j2c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32>], load0A6;\n", + j2c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32>], load0A5;\n", + j2c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*32>], load0A4;\n", + j2c24 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*32>], load0A3;\n", + j2c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], load0A2;\n", + j2c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], load0A1;\n", + j2c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<0*32>], load0A0;\n", + + j3c16 => "04:3:-:-:1 \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n", + j4c16 => "08:4:-:-:1 \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n", + j5c16 => "10:5:-:-:1 \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n", + j6c16 => "20:6:-:-:1 \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n", + + ($vec ? + ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j2c51 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j3c51 => "08:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j4c51 => "10:-:-:-:1 \@P0 F2F.F32.F16 load2B3, 
load2B1.H1;\n",
+            j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n",
+            j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n",
+            j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+            j5c51 => "20:-:-:-:1 \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n",
+            j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n",
+            j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n",
+            j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+            j2c61 => "02:-:2:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+            j3c61 => "04:-:3:-:1 \@P3 LDG.E.CI.64 load0B, [track0B];\n",
+            j4c61 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1B, [track1B];\n",
+            j5c61 => "10:-:5:-:1 \@P3 LDG.E.CI.64 load2B, [track2B];\n",
+            j6c61 => "20:-:6:-:1 \@P3 LDG.E.CI.64 load3B, [track3B];\n",
+        ) :
+        (
+            j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+            j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+            j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+            j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+            j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+            j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+            j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+            j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A7, load0A7;\n", # last A conversion sets WRITE barrier 2 (third field), like the vec branch and the B groups; the STS at j2c16 waits on 02 for this result
+
+            j2c51 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+            j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+            j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+            j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+
+            j3c51 => "08:-:-:-:1 \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+            j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+            j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+            j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+
+            j4c51 => "10:-:-:-:1 \@P0 F2F.F32.F16 load2B0, load2B0;\n",
+            j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B1;\n",
+            j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B2;\n",
+            j4c63 =>
"--:-:5:-:1 \@P0 F2F.F32.F16 load2B3, load2B3;\n", + + j5c51 => "20:-:-:-:1 \@P0 F2F.F32.F16 load3B0, load3B0;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B1;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B2;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load3B3, load3B3;\n", + + j2c48 => "02:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j2c50 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j2c52 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j2c54 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + j2c56 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n", + j2c58 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n", + j2c60 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n", + j2c62 => "--:-:2:-:1 \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n", + + j3c56 => "04:-:-:-:1 \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j3c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j3c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j3c62 => "--:-:3:-:1 \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + + j4c56 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j4c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j4c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j4c62 => "--:-:4:-:1 \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + + j5c56 => "10:-:-:-:1 \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j5c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j5c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j5c62 => "--:-:5:-:1 \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + + j6c56 => "20:-:-:-:1 \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j6c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j6c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j6c62 
=> "--:-:6:-:1 \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + ) + ), + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out = ''; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHR.U32 tid16, tid, 4; +--:-:-:-:1 SHL tid15, tid15, 2; +--:-:-:-:1 ISCADD readCs, tid16, tid15, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*64 + tid15; +--:-:-:-:1 ISCADD cx, blkB, tid15, 6; +--:-:-:-:1 IADD cx1, cx, 1; +--:-:-:-:1 IADD cx2, cx, 2; +--:-:-:-:1 IADD cx3, cx, 3; + +// cy = blkA*32 + tid16 +--:-:-:-:1 ISCADD cy, blkA, tid16, 5; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 SHL ldc8, ldc, 4; + +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C1, ci, param_C[1], RZ, 1; + +// P0 = cx < n +--:-:-:-:1 ISETP.LT.AND P0, PT, cx, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, cx1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, cx2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, cx3, param_n, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +// P4 = cy < m +--:-:-:-:1 ISETP.LT.AND P4, PT, cy, param_m, PT; + +// P5 = beta != 0 && P4 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P4; + +// P6 = Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; + +// Init beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, 
cx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:5 EXIT; + +STORE_C: + +[+ + our $vec; + return $vec ? 
q{ +--:-:1:-:1 @P0 LDG.E.64 loadC, [C]; + } : q{ +--:-:-:-:0 @!P0 MOV loadC0, RZ; +--:-:-:-:1 @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>]; +--:-:-:-:0 @!P1 MOV loadC1, RZ; +--:-:-:-:1 @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>]; +--:-:-:-:0 @!P2 MOV loadC2, RZ; +--:-:-:-:1 @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>]; +--:-:-:-:0 @!P3 MOV loadC3, RZ; +--:-:1:-:1 @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>]; + }; ++] + +// Restore output preds +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 LDS.U.128 part0C, [readCs + 4x< 0*64>]; +--:-:2:-:1 LDS.U.128 part1C, [readCs + 4x<16*64>]; +--:-:-:-:1 LDS.U.128 part2C, [readCs + 4x<32*64>]; +--:-:3:-:1 LDS.U.128 part3C, [readCs + 4x<48*64>]; + + +02:-:-:-:1 @P0 FADD part0C0, part0C0, part1C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part1C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part1C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part1C3; + +04:-:-:-:1 @P0 FADD part2C0, part2C0, part3C0; +--:-:-:-:1 @P1 FADD part2C1, part2C1, part3C1; +--:-:-:-:1 @P2 FADD part2C2, part2C2, part3C2; +--:-:-:-:1 @P3 FADD part2C3, part2C3, part3C3; + +--:-:-:-:1 @P0 FADD c0, part0C0, part2C0; +--:-:-:-:1 @P1 FADD c1, part0C1, part2C1; +--:-:-:-:1 @P2 FADD c2, part0C2, part2C2; +--:-:-:-:1 @P3 FADD c3, part0C3, part2C3; + + +--:-:-:-:0 IADD cy, cy, 8; + +[+ + our $vec; + return $vec ? 
q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0.H0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC0.H1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC1.H0; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC1.H1; + } : q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC2; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC3; + }; ++] + +01:-:-:-:1 @P5 FFMA c0, b0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, b1, beta, c1; +04:-:-:-:1 @P5 FFMA c2, b2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, b3, beta, c3; + +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:0 ISETP.LT.AND P5, PT, cy, param_m, P5; + +--:-:1:-:1 @P0 F2F.F16.F32 c0, c0; +--:-:2:-:1 @P1 F2F.F16.F32 c1, c1; + +--:-:-:-:0 ISETP.LT.AND P4, PT, cy, param_m, PT; + +--:-:3:-:1 @P2 F2F.F16.F32 c2, c2; + +--:-:-:-:0 LOP.XOR readCs, readCs, 4x<8*64>; + +--:-:4:-:1 @P3 F2F.F16.F32 c3, c3; + +[+ + our $vec; + return $vec ? q{ +03:-:-:-:2 @P0 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 @P0 BFI c1, c3, 0x1010, c2; + +--:1:-:-:1 @P0 STG.E.CG.64 [C], c; + } : q{ +01:-:-:-:1 @P0 STG.E.U16 [C + 2x<0>], c0; +02:-:-:-:1 @P1 STG.E.U16 [C + 2x<1>], c1; +04:-:-:-:1 @P2 STG.E.U16 [C + 2x<2>], c2; +08:1:-:-:1 @P3 STG.E.U16 [C + 2x<3>], c3; + }; ++] + +// Restore beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +01:-:-:-:6 IADD C0.CC, C0, ldc8; +--:-:-:-:0 IADD.X C1, C1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_nt_128x128.sass b/Kernel/SGEMM/Maxwell/hgemm_nt_128x128.sass new file mode 100644 index 0000000..29a50f0 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nt_128x128.sass @@ -0,0 +1,400 @@ +# Kernel: hgemm_nt_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + +our $int16; + +our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + +sub convert_in {return $convert;} + + +sub int16_params { + return $int16 ? q{ +param_Stats[0] : c[0x0][0x190] +param_Stats[1] : c[0x0][0x194] +param_scale : c[0x0][0x198] + } : ""; +} +-] + + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + [+ int16_params() +] + + + + + 64-95 ~ tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid7, tid128, tid127, txa, txb, xmad_ta, xmad_tb, k1, k2, k3 + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-107 : loadA<0-5>, loadB<0-5> + + 108-111 : trackA<0-1>, trackB<0-1> + + 112-118 ~ writeS, k, tidY, ta, tb, loop + 119-127 ~ readAs, readBs, tid, 
blkA, blkB, blkZ + + 64-75 ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128 + + 64-79 : c<0-7>, d3, d2, d1, d0, cs<0-3> + 64-65 : Stats<0-1> + 80-89 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 90-118 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 MOV loop, RZ; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); ++] + +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// tidY = tid1 << 2 +--:-:-:-:1 SHL tidY, tid1, 2; + +// tidX = tid >> 1 +01:-:-:-:1 SHR.U32 tidX, tid, 1; + +// trackA += 2 * ((blkA*128 + tidX) * lda + tidY) +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x1; + +// trackB += 2 * ((blkB*128 + tidX) * ldb + tidY) +04:-:-:-:1 ISCADD txb, blkB, tidX, 7; +--:-:-:-:1 XMAD.LO tb, ldb, txb, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = 4 * (128 * tidY + tidX) +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeS, writeS, 4x<128*8*2>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 
4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + + +REMAINDER: + +[+ + our $vec; + return $vec ? q{ +// k must be multiple of 8 +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +--:-:2:-:1 @P5 LDG.E.CI.64 loadA0, [trackA + 2x<0>]; +--:-:2:-:1 @P5 LDG.E.CI.64 loadA4, [trackA + 2x<8>]; +--:-:4:-:1 @P6 LDG.E.CI.64 loadB0, [trackB + 2x<0>]; +--:5:6:-:1 @P6 LDG.E.CI.64 loadB4, [trackB + 2x<8>]; + +--:-:3:-:1 @!P5 LDS.U.64 loadA0, [addr_zero]; +--:-:3:-:1 @!P5 LDS.U.64 loadA4, [addr_zero]; +--:-:3:-:1 @!P6 LDS.U.64 loadB0, [addr_zero]; +--:-:3:-:2 @!P6 LDS.U.64 loadB4, [addr_zero]; + + // Vec 4 and scalar loads + } : q{ + +--:-:-:-:1 IADD k1, tidY, 1; +--:-:-:-:1 IADD k2, tidY, 2; +--:-:-:-:1 IADD k3, tidY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P6; + +--:-:4:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +// bDoRemainder = k & 7 && k 
> 8 +--:-:-:-:1 LOP.AND.NZ P4, RZ, k, 7; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 8, P4; + + }; ++] + +[+ + our $vec; + our $convert; + return $vec ? qq{ + +06:-:1:-:4 $convert loadA3, loadA1.H1; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<16>; +--:-:2:-:4 $convert loadA2, loadA1.H0; +--:-:-:-:4 $convert loadA1, loadA0.H1; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; +--:-:3:-:1 $convert loadA0, loadA0.H0; + +01:-:-:-:1 STS [writeS + 4x<3*128>], loadA3; +02:-:-:-:1 STS [writeS + 4x<2*128>], loadA2; +04:-:-:-:1 STS [writeS + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeS + 4x<0*128>], loadA0; + +08:-:1:-:4 $convert loadB3, loadB1.H1; +10:-:-:-:0 IADD trackB0.CC, trackB0, 2x<16>; +--:-:2:-:4 $convert loadB2, loadB1.H0; +--:-:3:-:4 $convert loadB1, loadB0.H1; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; +--:-:4:-:1 $convert loadB0, loadB0.H0; + +01:-:-:-:1 STS [writeS + 4x<11*128>], loadB3; +02:-:-:-:1 STS [writeS + 4x<10*128>], loadB2; +04:-:-:-:1 STS [writeS + 4x< 9*128>], loadB1; +08:-:-:-:1 STS [writeS + 4x< 8*128>], loadB0; + + // scalar loads + } : qq{ + +02:-:-:-:4 $convert loadA0, loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<8>; +--:-:2:-:4 $convert loadA1, loadA1; +--:-:-:-:4 $convert loadA2, loadA2; +--:-:3:-:1 $convert loadA3, loadA3; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS [writeS + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeS + 4x<1*128>], loadA1; +04:-:-:-:1 STS [writeS + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeS + 4x<3*128>], loadA3; + +08:-:-:-:4 $convert loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, 2x<8>; +--:-:2:-:4 $convert loadB1, loadB1; +--:-:-:-:4 $convert loadB2, loadB2; +--:-:3:-:1 $convert loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +02:-:-:-:1 STS [writeS + 4x< 8*128>], loadB0; +--:-:-:-:1 STS [writeS + 4x< 9*128>], loadB1; +04:-:-:-:1 STS [writeS + 4x<10*128>], loadB2; +--:-:-:-:1 STS [writeS + 4x<11*128>], loadB3; + }; ++] + + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, 
readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<128*8*2>; + + +[+ + our $vec; + our $convert; + our @top = $vec ? + ("--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n") : + ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, 16, P5;\n"); + our %insert = + ( + ($vec ? + ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + j0c13 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P5;\n", + j0c15 => "--:-:-:-:1 PSETP.AND.AND P3, PT, P0, P1, P6;\n", + + j0c27 => "--:-:2:-:1 \@P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];\n", + j0c29 => "--:-:2:-:1 \@P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];\n", + j0c31 => "--:-:4:-:1 \@P3 LDG.E.CI.64 loadB0, [trackB + 2x<0>];\n", + j0c33 => "20:5:6:-:1 \@P3 LDG.E.CI.64 loadB4, [trackB + 2x<8>];\n", + + j3c5 => "--:-:-:-:1 \@!P1 $convert loadA3, loadA5.H1;\n", + j3c9 => "--:-:-:-:1 \@!P1 $convert loadA2, loadA5.H0;\n", + j3c13 => "--:-:-:-:1 \@!P1 $convert loadA1, loadA4.H1;\n", + j3c17 => "--:-:-:-:1 \@!P1 $convert loadA0, loadA4.H0;\n", + + j4c5 => "--:-:-:-:1 \@!P1 $convert loadB3, loadB5.H1;\n", + j4c9 => "--:-:-:-:1 \@!P1 $convert loadB2, loadB5.H0;\n", + j4c13 => "--:-:-:-:1 \@!P1 $convert loadB1, loadB4.H1;\n", + j4c17 => "--:-:-:-:1 \@!P1 $convert loadB0, loadB4.H0;\n", + + j5c5 => "02:-:-:-:1 \@P1 $convert loadA3, loadA1.H1;\n", + j5c9 => "--:-:2:-:1 \@P1 $convert loadA2, loadA1.H0;\n", + j5c13 => "--:-:-:-:1 \@P1 $convert loadA1, loadA0.H1;\n", + j5c17 => "--:-:3:-:1 \@P1 $convert loadA0, loadA0.H0;\n", + + j5c29 => "02:-:-:-:1 \@P0 STS [writeS + 4x<3*128>], loadA3;\n", + j5c31 => "--:-:-:-:1 \@P0 STS [writeS + 4x<2*128>], loadA2;\n", + j5c33 => "04:-:-:-:1 \@P0 STS [writeS + 4x<1*128>], loadA1;\n", + j5c35 => "--:-:-:-:1 \@P0 STS [writeS + 4x<0*128>], loadA0;\n", + + j6c5 => "08:-:-:-:1 \@P1 $convert loadB3, loadB1.H1;\n", + j6c9 => "--:-:2:-:1 \@P1 $convert loadB2, loadB1.H0;\n", + j6c13 => "--:-:3:-:1 \@P1 $convert loadB1, loadB0.H1;\n", + j6c17 => "--:-:4:-:1 \@P1 $convert loadB0, loadB0.H0;\n", + + j6c29 
=> "02:-:-:-:1 \@P0 STS [writeS + 4x<11*128>], loadB3;\n", + j6c31 => "--:-:-:-:1 \@P0 STS [writeS + 4x<10*128>], loadB2;\n", + j6c33 => "04:-:-:-:1 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n", + j6c35 => "08:-:-:-:1 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 2x<16>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c46 => "10:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, 2x<16>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : + ( + j0c7 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 16, P6;\n", + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n", + j0c12 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n", + j0c14 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n", + j0c16 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n", + + j0c29 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j0c31 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j0c33 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j0c35 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j5c5 => "02:-:2:-:1 \@P2 $convert loadA0, loadA0;\n", + j5c9 => "--:-:3:-:1 \@P2 $convert loadA1, loadA1;\n", + j5c13 => "--:-:4:-:1 \@P2 $convert loadA2, loadA2;\n", + j5c17 => "--:-:5:-:1 \@P2 $convert loadA3, loadA3;\n", + + j5c29 => "02:-:-:-:1 \@P0 STS [writeS + 4x<0*128>], loadA0;\n", + j5c31 => "04:-:-:-:1 \@P0 STS [writeS + 4x<1*128>], loadA1;\n", + j5c33 => "08:-:-:-:1 \@P0 STS [writeS + 4x<2*128>], loadA2;\n", + j5c35 => "10:-:-:-:1 \@P0 STS [writeS + 4x<3*128>], loadA3;\n", + + j6c5 => "20:-:2:-:1 \@P3 $convert loadB0, loadB0;\n", + j6c9 => "--:-:3:-:1 \@P3 $convert loadB1, loadB1;\n", + j6c13 => "--:-:4:-:1 \@P3 $convert loadB2, loadB2;\n", + j6c17 => "--:-:5:-:1 \@P3 $convert loadB3, loadB3;\n", + + j6c29 => 
"02:-:-:-:1 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n", + j6c31 => "04:-:-:-:1 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n", + j6c33 => "08:-:-:-:1 \@P0 STS [writeS + 4x<10*128>], loadB2;\n", + j6c35 => "10:-:-:-:1 \@P0 STS [writeS + 4x<11*128>], loadB3;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 2x<8>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c46 => "10:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, 2x<8>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ) + ), + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" . + "--:-:-:-:1 IADD32I k, k, -8;\n", + ); + return; ++] + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_nt_16x64.sass b/Kernel/SGEMM/Maxwell/hgemm_nt_16x64.sass new file mode 100644 index 0000000..ce5e6ef --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nt_16x64.sass @@ -0,0 +1,1185 @@ +# Kernel: hgemm_nt_16x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<(16*64 + 32)*2 + (64*64 + 32)*2> + szShareA : (16*64 + 32) + szShareB : (64*64 + 32) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 64-95 ~ tidX, tidY, tidY<1-3>, lda, ldb, ldaz, ldbz, ldb16, tid16_8, ta, txa, tb<00|16|32|48>, txb<00|16|32|48>, xmad_ta, xmad_tb, shiftX, predsY0, predsY4, partialK + + 96-135 : load0A<0-7>, load0B<0-7>, load1B<0-7>, load2B<0-7>, load3B<0-7> + 136-145 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 146-152 ~ swapBuf, readAs, readBs, writeAs, writeBs, k + 153-159 ~ tid, blkA, blkB, blkZ, writeCs, preds, tid16 + + 0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3> + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + 96-99 : loadC<0-3> + 100-103 : b<0-3> + 104-107 : c<0-3> + 108-109 : C<0-1> + 110-152 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15 + + + 
+// Prologue: read the thread and block indices (tid <- SR_TID.X, blkA <- SR_CTAID.Y, blkB <- SR_CTAID.Z, blkZ <- SR_CTAID.X) +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +// Copy the k/lda/ldb/ldaz/ldbz parameters into registers; ldb16 = ldb << 4 +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb16, ldb, 4; + +// Store RZ at [addr_zero], then emit 16 LDS.U.128 loads from that address into czero00, czero04, ... czero60 +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// tidX = tid >> 3 +// tidY = (tid & 7) << 3 +// shiftX = (tid & 7) << 2 +01:-:-:-:1 SHR.U32 tidX, tid, 3; +--:-:-:-:1 LOP.AND tidY, tid, 7; +--:-:-:-:1 SHL shiftX, tidY, 2; +--:-:-:-:1 SHL tidY, tidY, 3; + +// trackA += ((blkA*16 + tidX) * lda + tidY) * 2 +// NOTE(review): the XMAD.LO2 step also folds ldaz and blkZ into ta - presumably a per-blkZ offset; confirm +02:-:-:-:1 ISCADD txa, blkA, tidX, 4; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 1; + +// P2 = txa < param_m +--:-:-:-:1 ISETP.LT.AND P2, PT, txa, param_m, PT; + +// trackB += ((blkB*64 + tidX) * ldb + tidY) * 2 +// txb16/txb32/txb48 are txb00 + 16/32/48; tb16/tb32/tb48 each add ldb16 to the previous tb value +04:-:-:-:1 ISCADD txb00, blkB, tidX, 6; +--:-:-:-:1 IADD txb16, txb00, 16; +--:-:-:-:1 IADD txb32, txb00, 32; +--:-:-:-:1 IADD txb48, txb00, 48; +--:-:-:-:1 XMAD.LO tb00, ldb, txb00, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb00, ldbz, blkZ, tb00; +--:-:-:-:1 IADD tb16, tb00, ldb16; +--:-:-:-:1 IADD tb32, tb16, ldb16; +--:-:-:-:1 IADD tb48, tb32, ldb16; +--:-:-:-:1 LEA track0B0.CC, tb00, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb00, param_B[1], RZ, 1; +--:-:-:-:1 LEA track1B0.CC, tb16, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb16, param_B[1], RZ, 1; +--:-:-:-:1 LEA track2B0.CC, tb32, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb32, param_B[1], RZ, 1; +--:-:-:-:1 LEA track3B0.CC, tb48, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb48, param_B[1], RZ, 1; + + +// P3..P5 = txb00/txb16/txb32 < param_n +--:-:-:-:1 ISETP.LT.AND P3, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb16, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb32, param_n, PT; 
+--:-:-:-:1 ISETP.LT.AND P6, PT, txb48, param_n, PT; + +// Pack the predicate bits selected by mask 0x7c (bits 2-6) into preds +--:-:-:-:1 P2R preds, PR, RZ, 0x7c; + +// writeAs = (tidY*16 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 4; +--:-:-:-:1 IADD writeAs, writeAs, shiftX; +--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (tidY*64 + tidX + shiftX) * 4 + 4x<szShareA> +// B addresses carry a 4x<szShareA> byte offset so the B tile follows the A tile in shared memory +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 6; +--:-:-:-:1 IADD writeBs, writeBs, shiftX; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<szShareA>, 2; + +// readAs = (tid & 1) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid >> 1) & 7) << 4 +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHL readBs, readBs, 4; + +// tid16 = tid & -16 +// tid16_8 = tid16 / 2 * 4 +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHL tid16_8, tid16, 1; + +// writeCs = (readAs + tid16*2) * 64 + readBs; +--:-:-:-:1 ISCADD writeCs, tid16, readAs, 1; +--:-:-:-:1 ISCADD writeCs, writeCs, readBs, 6; + +// Each block of 16 threads works on 8 lines, shifted over by 4 +// readAs += tid16_8 * 16 + tid16 +// readBs += tid16_8 * 64 + tid16 + 4x<szShareA> +--:-:-:-:1 ISCADD readAs, tid16_8, readAs, 4; +--:-:-:-:1 ISCADD readBs, tid16_8, readBs, 6; +--:-:-:-:1 IADD readAs, tid16, readAs; +--:-:-:-:1 IADD3 readBs, tid16, 4x<szShareA>, readBs; + +// swapBuf = byte distance between the two A+B buffer sets (addr_zero is 4x<2*szShareA + 2*szShareB>) +--:-:-:-:1 MOV32I swapBuf, 4x<szShareA + szShareB>; + +// If k is not a multiple of 64 we want to grab the partial amount on the first fetch. +// If it is a multiple of 64 then make a full 64 line fetch. +// partialK = k & 63, replaced by 64 when that is zero; k -= partialK +--:-:-:-:1 LOP.AND.Z P0, partialK, k, 63; +--:-:-:-:1 @P0 MOV partialK, 64; +--:-:-:-:1 IADD k, k, -partialK; +[+ + our $vec; + return $vec ? 
q{ + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x7c; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c; + + +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.128 load0B, [track0B]; +--:-:4:-:1 @P4 LDG.E.CI.128 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load2B, [track2B]; +--:-:6:-:1 @P6 LDG.E.CI.128 load3B, [track3B]; + + + +--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.128 load0B, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 load1B, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.128 load2B, [addr_zero]; +--:-:1:-:1 @!P6 LDS.U.128 load3B, [addr_zero]; + + + } : q{ +--:-:-:-:1 IADD tidY1, tidY, 1; +--:-:-:-:1 IADD tidY2, tidY, 2; +--:-:-:-:1 IADD tidY3, tidY, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, partialK, PT; +--:-:-:-:1 P2R predsY0, PR, RZ, 0x0f; + +--:-:-:-:1 IADD tidY, tidY, 4; +--:-:-:-:1 IADD tidY1, tidY1, 4; +--:-:-:-:1 IADD tidY2, tidY2, 4; +--:-:-:-:1 IADD tidY3, tidY3, 4; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, partialK, PT; +--:-:-:-:1 P2R predsY4, PR, RZ, 0x0f; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa, param_m, PT; +--:-:-:-:1 @P4 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 @P4 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0A4, [track0A + 
2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load0A4, RZ; +--:-:-:-:1 @!P1 MOV load0A5, RZ; +--:-:-:-:1 @!P2 MOV load0A6, RZ; +--:-:-:-:1 @!P3 MOV load0A7, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb00, param_n, PT; +--:-:-:-:1 @P5 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 @P5 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0B4, [track0B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0B5, [track0B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0B6, [track0B + 2x<6>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load0B4, RZ; +--:-:-:-:1 @!P1 MOV load0B5, RZ; +--:-:-:-:1 @!P2 MOV load0B6, RZ; +--:-:-:-:1 @!P3 MOV load0B7, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb16, param_n, PT; +--:-:-:-:1 @P6 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 @P6 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1B4, [track1B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1B5, [track1B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1B6, [track1B + 
2x<6>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B7, [track1B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load1B4, RZ; +--:-:-:-:1 @!P1 MOV load1B5, RZ; +--:-:-:-:1 @!P2 MOV load1B6, RZ; +--:-:-:-:1 @!P3 MOV load1B7, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb32, param_n, PT; +--:-:-:-:1 @P4 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 @P4 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load2B4, [track2B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load2B5, [track2B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load2B6, [track2B + 2x<6>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 load2B7, [track2B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load2B4, RZ; +--:-:-:-:1 @!P1 MOV load2B5, RZ; +--:-:-:-:1 @!P2 MOV load2B6, RZ; +--:-:-:-:1 @!P3 MOV load2B7, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb48, param_n, PT; +--:-:-:-:1 @P6 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:6:-:1 @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + +--:-:-:-:1 @P6 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load3B4, [track3B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load3B5, [track3B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load3B6, [track3B + 2x<6>]; +--:-:6:-:1 @P3 LDG.E.CI.U16 load3B7, [track3B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load3B4, RZ; +--:-:-:-:1 @!P1 
MOV load3B5, RZ; +--:-:-:-:1 @!P2 MOV load3B6, RZ; +--:-:-:-:1 @!P3 MOV load3B7, RZ; + }; ++] +--:-:-:-:1 SHL partialK, partialK, 1; + +--:-:-:-:1 ISETP.GE.AND P0, PT, k, 64, PT; +--:-:-:-:1 IADD k, k, -64; +--:-:-:-:1 @P0 R2P PR, preds, 0x7c; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x7c; + + +[+ + our $vec; + return $vec ? q{ +03:-:-:-:1 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A2.H1; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0.H0; + } : q{ +02:-:-:-:1 F2F.F32.F16 load0A7, load0A7; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A6; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A5; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A4; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A3; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0; + }; ++] +--:-:-:-:0 IADD track0A0.CC, track0A0, partialK; +01:-:-:-:1 STS [writeAs + 4x<7*16>], load0A7; +--:-:-:-:1 STS [writeAs + 4x<6*16>], load0A6; +--:-:-:-:1 STS [writeAs + 4x<5*16>], load0A5; +--:-:-:-:1 STS [writeAs + 4x<4*16>], load0A4; +02:-:-:-:1 STS [writeAs + 4x<3*16>], load0A3; +--:-:-:-:1 STS [writeAs + 4x<2*16>], load0A2; +--:-:-:-:1 STS [writeAs + 4x<1*16>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<0*16>], load0A0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +04:-:-:-:1 F2F.F32.F16 load0B7, load0B3.H1; +--:-:-:-:1 F2F.F32.F16 load0B6, load0B3.H0; +--:-:-:-:1 F2F.F32.F16 load0B5, load0B2.H1; +--:-:1:-:1 F2F.F32.F16 load0B4, load0B2.H0; +--:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:2:-:1 F2F.F32.F16 load0B0, load0B0.H0; + } : q{ +04:-:-:-:1 F2F.F32.F16 load0B7, load0B7; +--:-:-:-:1 F2F.F32.F16 load0B6, load0B6; +--:-:-:-:1 F2F.F32.F16 load0B5, load0B5; +--:-:1:-:1 F2F.F32.F16 load0B4, load0B4; +--:-:-:-:1 F2F.F32.F16 load0B3, load0B3; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:2:-:1 F2F.F32.F16 load0B0, load0B0; + }; ++] +--:-:-:-:0 IADD track0B0.CC, track0B0, partialK; +01:-:-:-:1 STS [writeBs + 4x<7*64 + 0*16>], load0B7; +--:-:-:-:1 STS [writeBs + 4x<6*64 + 0*16>], load0B6; +--:-:-:-:1 STS [writeBs + 4x<5*64 + 0*16>], load0B5; +--:-:-:-:1 STS [writeBs + 4x<4*64 + 0*16>], load0B4; +02:-:-:-:1 STS [writeBs + 4x<3*64 + 0*16>], load0B3; +--:-:-:-:1 STS [writeBs + 4x<2*64 + 0*16>], load0B2; +--:-:-:-:1 STS [writeBs + 4x<1*64 + 0*16>], load0B1; +--:-:-:-:1 STS [writeBs + 4x<0*64 + 0*16>], load0B0; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +08:-:-:-:1 F2F.F32.F16 load1B7, load1B3.H1; +--:-:-:-:1 F2F.F32.F16 load1B6, load1B3.H0; +--:-:-:-:1 F2F.F32.F16 load1B5, load1B2.H1; +--:-:1:-:1 F2F.F32.F16 load1B4, load1B2.H0; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + } : q{ +08:-:-:-:1 F2F.F32.F16 load1B7, load1B7; +--:-:-:-:1 F2F.F32.F16 load1B6, load1B6; +--:-:-:-:1 F2F.F32.F16 load1B5, load1B5; +--:-:1:-:1 F2F.F32.F16 load1B4, load1B4; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B3; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0; + }; ++] +--:-:-:-:0 IADD track1B0.CC, track1B0, partialK; +01:-:-:-:1 STS [writeBs + 4x<7*64 + 1*16>], load1B7; +--:-:-:-:1 STS [writeBs + 4x<6*64 + 1*16>], load1B6; +--:-:-:-:1 STS [writeBs + 4x<5*64 + 1*16>], load1B5; +--:-:-:-:1 STS [writeBs + 4x<4*64 + 1*16>], load1B4; +02:-:-:-:1 STS [writeBs + 4x<3*64 + 1*16>], load1B3; +--:-:-:-:1 STS [writeBs + 4x<2*64 + 1*16>], load1B2; +--:-:-:-:1 STS [writeBs + 4x<1*64 + 1*16>], load1B1; +--:-:-:-:1 STS [writeBs + 4x<0*64 + 1*16>], load1B0; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +10:-:-:-:1 F2F.F32.F16 load2B7, load2B3.H1; +--:-:-:-:1 F2F.F32.F16 load2B6, load2B3.H0; +--:-:-:-:1 F2F.F32.F16 load2B5, load2B2.H1; +--:-:1:-:1 F2F.F32.F16 load2B4, load2B2.H0; +--:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:2:-:1 F2F.F32.F16 load2B0, load2B0.H0; + } : q{ +10:-:-:-:1 F2F.F32.F16 load2B7, load2B7; +--:-:-:-:1 F2F.F32.F16 load2B6, load2B6; +--:-:-:-:1 F2F.F32.F16 load2B5, load2B5; +--:-:1:-:1 F2F.F32.F16 load2B4, load2B4; +--:-:-:-:1 F2F.F32.F16 load2B3, load2B3; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:2:-:1 F2F.F32.F16 load2B0, load2B0; + }; ++] +--:-:-:-:0 IADD track2B0.CC, track2B0, partialK; +01:-:-:-:1 STS [writeBs + 4x<7*64 + 2*16>], load2B7; +--:-:-:-:1 STS [writeBs + 4x<6*64 + 2*16>], load2B6; +--:-:-:-:1 STS [writeBs + 4x<5*64 + 2*16>], load2B5; +--:-:-:-:1 STS [writeBs + 4x<4*64 + 2*16>], load2B4; +02:-:-:-:1 STS [writeBs + 4x<3*64 + 2*16>], load2B3; +--:-:-:-:1 STS [writeBs + 4x<2*64 + 2*16>], load2B2; +--:-:-:-:1 STS [writeBs + 4x<1*64 + 2*16>], load2B1; +--:-:-:-:1 STS [writeBs + 4x<0*64 + 2*16>], load2B0; +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +20:-:-:-:1 F2F.F32.F16 load3B7, load3B3.H1; +--:-:-:-:1 F2F.F32.F16 load3B6, load3B3.H0; +--:-:-:-:1 F2F.F32.F16 load3B5, load3B2.H1; +--:-:1:-:1 F2F.F32.F16 load3B4, load3B2.H0; +--:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:2:-:1 F2F.F32.F16 load3B0, load3B0.H0; + } : q{ +20:-:-:-:1 F2F.F32.F16 load3B7, load3B7; +--:-:-:-:1 F2F.F32.F16 load3B6, load3B6; +--:-:-:-:1 F2F.F32.F16 load3B5, load3B5; +--:-:1:-:1 F2F.F32.F16 load3B4, load3B4; +--:-:-:-:1 F2F.F32.F16 load3B3, load3B3; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:2:-:1 F2F.F32.F16 load3B0, load3B0; + }; ++] +--:-:-:-:0 IADD track3B0.CC, track3B0, partialK; +01:-:-:-:1 STS [writeBs + 4x<7*64 + 3*16>], load3B7; +--:-:-:-:1 STS [writeBs + 4x<6*64 + 3*16>], load3B6; +--:-:-:-:1 STS [writeBs + 4x<5*64 + 3*16>], load3B5; +--:-:-:-:1 STS [writeBs + 4x<4*64 + 3*16>], load3B4; +02:-:-:-:1 STS [writeBs + 4x<3*64 + 3*16>], load3B3; +--:-:-:-:1 STS [writeBs + 4x<2*64 + 3*16>], load3B2; +--:-:-:-:1 STS [writeBs + 4x<1*64 + 3*16>], load3B1; +--:-:-:-:1 STS [writeBs + 4x<0*64 + 3*16>], load3B0; +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*16 + 00>]; +--:-:-:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*16 + 08>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +[+ + our $vec; + return $vec ? 
q{ +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.128 load0B, [track0B]; +--:-:4:-:1 @P4 LDG.E.CI.128 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load2B, [track2B]; +--:-:6:-:1 @P6 LDG.E.CI.128 load3B, [track3B]; + } : q{ +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B4, [track0B + 2x<4>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B5, [track0B + 2x<5>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B6, [track0B + 2x<6>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>]; + +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B4, [track1B + 2x<4>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B5, [track1B + 2x<5>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B6, [track1B + 2x<6>]; +--:-:4:-:1 @P4 LDG.E.CI.U16 load1B7, [track1B + 2x<7>]; + +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B4, [track2B + 2x<4>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B5, [track2B + 2x<5>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 
load2B6, [track2B + 2x<6>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B7, [track2B + 2x<7>]; + +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B4, [track3B + 2x<4>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B5, [track3B + 2x<5>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B6, [track3B + 2x<6>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load3B7, [track3B + 2x<7>]; + }; ++] + +LOOP: + +[+ + our $vec; + our %insert = + ( + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, RZ, PT;\n", + j0c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, k, 64, PT;\n" . + "--:-:-:-:1 IADD k, k, -64;\n", + + j0c23 => "--:-:-:-:1 \@P1 R2P PR, preds, 0x7c;\n", + j0c24 => "--:-:-:-:1 \@!P1 R2P PR, RZ, 0x7c;\n", + + j2c32 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<64>;\n", + j2c37 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j3c32 => "--:-:-:-:1 \@P3 IADD track0B0.CC, track0B0, 2x<64>;\n", + j3c37 => "--:-:-:-:1 \@P3 IADD.X track0B1, track0B1, RZ;\n", + j4c32 => "--:-:-:-:1 \@P4 IADD track1B0.CC, track1B0, 2x<64>;\n", + j4c37 => "--:-:-:-:1 \@P4 IADD.X track1B1, track1B1, RZ;\n", + j5c32 => "--:-:-:-:1 \@P5 IADD track2B0.CC, track2B0, 2x<64>;\n", + j5c37 => "--:-:-:-:1 \@P5 IADD.X track2B1, track2B1, RZ;\n", + j6c32 => "--:-:-:-:1 \@P6 IADD track3B0.CC, track3B0, 2x<64>;\n", + j6c37 => "--:-:-:-:1 \@P6 IADD.X track3B1, track3B1, RZ;\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j2c36 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B7, load0B3.H1;\n", + j2c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B6, load0B3.H0;\n", + j2c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B5, load0B2.H1;\n", + j2c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B4, load0B2.H0;\n", + j2c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j3c36 => "08:-:-:-:1 \@P0 F2F.F32.F16 load1B7, load1B3.H1;\n", + j3c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B6, load1B3.H0;\n", + j3c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B5, load1B2.H1;\n", + j3c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B4, load1B2.H0;\n", + j3c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j4c36 => "10:-:-:-:1 \@P0 F2F.F32.F16 load2B7, load2B3.H1;\n", + j4c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B6, load2B3.H0;\n", + j4c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B5, load2B2.H1;\n", + j4c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B4, load2B2.H0;\n", + j4c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 
load2B1, load2B0.H1;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n", + + j5c36 => "20:-:-:-:1 \@P0 F2F.F32.F16 load3B7, load3B3.H1;\n", + j5c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B6, load3B3.H0;\n", + j5c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B5, load3B2.H1;\n", + j5c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B4, load3B2.H0;\n", + j5c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j2c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<7*16>], load0A7;\n", + j2c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*16>], load0A6;\n", + j2c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*16>], load0A5;\n", + j2c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*16>], load0A4;\n", + j2c24 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*16>], load0A3;\n", + j2c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*16>], load0A2;\n", + j2c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*16>], load0A1;\n", + j2c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<0*16>], load0A0;\n", + + j3c16 => "04:-:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 0*16>], load0B7;\n", + j3c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 0*16>], load0B6;\n", + j3c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 0*16>], load0B5;\n", + j3c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 0*16>], load0B4;\n", + j3c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 0*16>], load0B3;\n", + j3c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 0*16>], load0B2;\n", + j3c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 0*16>], load0B1;\n", + j3c30 => "--:3:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 0*16>], load0B0;\n", + + j4c16 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 1*16>], load1B7;\n", + j4c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 1*16>], load1B6;\n", + j4c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 1*16>], load1B5;\n", + j4c22 => "--:-:-:-:1 \@P0 STS 
[writeBs + 4x<4*64 + 1*16>], load1B4;\n", + j4c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 1*16>], load1B3;\n", + j4c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 1*16>], load1B2;\n", + j4c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 1*16>], load1B1;\n", + j4c30 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 1*16>], load1B0;\n", + + j5c16 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 2*16>], load2B7;\n", + j5c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 2*16>], load2B6;\n", + j5c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 2*16>], load2B5;\n", + j5c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 2*16>], load2B4;\n", + j5c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 2*16>], load2B3;\n", + j5c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 2*16>], load2B2;\n", + j5c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 2*16>], load2B1;\n", + j5c30 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 2*16>], load2B0;\n", + + j6c16 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 3*16>], load3B7;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 3*16>], load3B6;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 3*16>], load3B5;\n", + j6c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 3*16>], load3B4;\n", + j6c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 3*16>], load3B3;\n", + j6c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 3*16>], load3B2;\n", + j6c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 3*16>], load3B1;\n", + j6c30 => "--:6:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 3*16>], load3B0;\n", + + j2c61 => "02:-:2:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j3c61 => "04:-:3:-:1 \@P3 LDG.E.CI.128 load0B, [track0B];\n", + j4c61 => "08:-:4:-:1 \@P4 LDG.E.CI.128 load1B, [track1B];\n", + j5c61 => "10:-:5:-:1 \@P5 LDG.E.CI.128 load2B, [track2B];\n", + j6c61 => "20:-:6:-:1 \@P6 LDG.E.CI.128 load3B, [track3B];\n", + ) : + ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A0, load0A0;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A1;\n", + j1c43 => 
"--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A2;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A3;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A4;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A5;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A6;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A7, load0A7;\n", + + j2c36 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0;\n", + j2c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B1;\n", + j2c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B2;\n", + j2c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B3;\n", + j2c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B4, load0B4;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B5, load0B5;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B6, load0B6;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load0B7, load0B7;\n", + + j3c36 => "08:-:-:-:1 \@P0 F2F.F32.F16 load1B0, load1B0;\n", + j3c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B1;\n", + j3c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B2;\n", + j3c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B3;\n", + j3c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B4, load1B4;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B5, load1B5;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B6, load1B6;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load1B7, load1B7;\n", + + j4c36 => "10:-:-:-:1 \@P0 F2F.F32.F16 load2B0, load2B0;\n", + j4c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B1;\n", + j4c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B2;\n", + j4c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B3, load2B3;\n", + j4c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B4, load2B4;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B5, load2B5;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B6, load2B6;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load2B7, load2B7;\n", + + j5c36 => "20:-:-:-:1 \@P0 F2F.F32.F16 load3B0, load3B0;\n", + j5c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B1;\n", + j5c43 => 
"--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B2;\n", + j5c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B3, load3B3;\n", + j5c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B4, load3B4;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B5, load3B5;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B6, load3B6;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load3B7, load3B7;\n", + + j2c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<0*16>], load0A0;\n", + j2c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*16>], load0A1;\n", + j2c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*16>], load0A2;\n", + j2c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*16>], load0A3;\n", + j2c24 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*16>], load0A4;\n", + j2c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*16>], load0A5;\n", + j2c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*16>], load0A6;\n", + j2c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<7*16>], load0A7;\n", + + j3c16 => "04:-:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 0*16>], load0B0;\n", + j3c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 0*16>], load0B1;\n", + j3c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 0*16>], load0B2;\n", + j3c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 0*16>], load0B3;\n", + j3c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 0*16>], load0B4;\n", + j3c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 0*16>], load0B5;\n", + j3c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 0*16>], load0B6;\n", + j3c30 => "--:3:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 0*16>], load0B7;\n", + + j4c16 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 1*16>], load1B0;\n", + j4c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 1*16>], load1B1;\n", + j4c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 1*16>], load1B2;\n", + j4c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 1*16>], load1B3;\n", + j4c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 1*16>], load1B4;\n", + j4c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 1*16>], load1B5;\n", + j4c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 1*16>], 
load1B6;\n", + j4c30 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 1*16>], load1B7;\n", + + j5c16 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 2*16>], load2B0;\n", + j5c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 2*16>], load2B1;\n", + j5c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 2*16>], load2B2;\n", + j5c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 2*16>], load2B3;\n", + j5c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 2*16>], load2B4;\n", + j5c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 2*16>], load2B5;\n", + j5c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 2*16>], load2B6;\n", + j5c30 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 2*16>], load2B7;\n", + + j6c16 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 3*16>], load3B0;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 3*16>], load3B1;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 3*16>], load3B2;\n", + j6c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 3*16>], load3B3;\n", + j6c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 3*16>], load3B4;\n", + j6c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 3*16>], load3B5;\n", + j6c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 3*16>], load3B6;\n", + j6c30 => "--:6:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 3*16>], load3B7;\n", + + j2c48 => "02:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j2c50 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j2c52 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j2c54 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + j2c56 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n", + j2c58 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n", + j2c60 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n", + j2c62 => "--:-:2:-:1 \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n", + + j3c48 => "04:-:-:-:1 \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j3c50 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B1, [track0B + 
2x<1>];\n", + j3c52 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j3c54 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + j3c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];\n", + j3c58 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];\n", + j3c60 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];\n", + j3c62 => "--:-:3:-:1 \@P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];\n", + + j4c48 => "08:-:-:-:1 \@P4 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j4c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j4c52 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j4c54 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + j4c56 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];\n", + j4c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];\n", + j4c60 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];\n", + j4c62 => "--:-:4:-:1 \@P4 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];\n", + + j5c48 => "10:-:-:-:1 \@P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j5c50 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j5c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j5c54 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + j5c56 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B4, [track2B + 2x<4>];\n", + j5c58 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B5, [track2B + 2x<5>];\n", + j5c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B6, [track2B + 2x<6>];\n", + j5c62 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load2B7, [track2B + 2x<7>];\n", + + j6c48 => "20:-:-:-:1 \@P6 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j6c50 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j6c52 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j6c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + j6c56 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B4, [track3B + 2x<4>];\n", + j6c58 => 
"--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B5, [track3B + 2x<5>];\n", + j6c60 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B6, [track3B + 2x<6>];\n", + j6c62 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 load3B7, [track3B + 2x<7>];\n", + ) + ), + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out = ''; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*16 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*16 + 08>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHR.U32 tid16, tid, 4; +--:-:-:-:1 SHL tid15, tid15, 2; +--:-:-:-:1 ISCADD readCs, tid16, tid15, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*64 + tid15; +--:-:-:-:1 ISCADD cx, blkB, tid15, 6; +--:-:-:-:1 IADD cx1, cx, 1; +--:-:-:-:1 IADD cx2, cx, 2; +--:-:-:-:1 IADD cx3, cx, 3; + +// cy = blkA*16 + tid16 +--:-:-:-:1 ISCADD cy, blkA, tid16, 4; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 SHL ldc8, ldc, 4; + +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C1, ci, param_C[1], RZ, 1; + +// P0 = cx < n +--:-:-:-:1 ISETP.LT.AND P0, PT, cx, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, cx1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, cx2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, cx3, param_n, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +// P4 = cy < m +--:-:-:-:1 ISETP.LT.AND P4, PT, cy, param_m, PT; + +// P5 = beta != 0 && P4 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P4; + +// P6 = Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; + +// Init beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, cx7y4, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:5 EXIT; + +STORE_C: + +[+ + our $vec; + return $vec ? 
q{ +--:-:1:-:1 @P0 LDG.E.64 loadC, [C]; + } : q{ +--:-:-:-:0 @!P0 MOV loadC0, RZ; +--:-:-:-:1 @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>]; +--:-:-:-:0 @!P1 MOV loadC1, RZ; +--:-:-:-:1 @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>]; +--:-:-:-:0 @!P2 MOV loadC2, RZ; +--:-:-:-:1 @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>]; +--:-:-:-:0 @!P3 MOV loadC3, RZ; +--:-:1:-:1 @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>]; + }; ++] + +// Restore output preds +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 LDS.U.128 part0C, [readCs + 4x<0*8*64>]; +--:-:2:-:1 LDS.U.128 part1C, [readCs + 4x<1*8*64>]; +--:-:-:-:1 LDS.U.128 part2C, [readCs + 4x<2*8*64>]; +--:-:3:-:1 LDS.U.128 part3C, [readCs + 4x<3*8*64>]; +--:-:-:-:1 LDS.U.128 part4C, [readCs + 4x<4*8*64>]; +--:-:4:-:1 LDS.U.128 part5C, [readCs + 4x<5*8*64>]; +--:-:-:-:1 LDS.U.128 part6C, [readCs + 4x<6*8*64>]; +--:-:5:-:1 LDS.U.128 part7C, [readCs + 4x<7*8*64>]; + + +02:-:-:-:1 @P0 FADD part0C0, part0C0, part1C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part1C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part1C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part1C3; + +04:-:-:-:1 @P0 FADD part2C0, part2C0, part3C0; +--:-:-:-:1 @P1 FADD part2C1, part2C1, part3C1; +--:-:-:-:1 @P2 FADD part2C2, part2C2, part3C2; +--:-:-:-:1 @P3 FADD part2C3, part2C3, part3C3; + +08:-:-:-:1 @P0 FADD part4C0, part4C0, part5C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part5C1; +--:-:-:-:1 @P2 FADD part4C2, part4C2, part5C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part5C3; + +10:-:-:-:1 @P0 FADD part6C0, part6C0, part7C0; +--:-:-:-:1 @P1 FADD part6C1, part6C1, part7C1; +--:-:-:-:1 @P2 FADD part6C2, part6C2, part7C2; +--:-:-:-:1 @P3 FADD part6C3, part6C3, part7C3; + +--:-:-:-:1 @P0 FADD part0C0, part0C0, part2C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part2C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part2C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part2C3; + +--:-:-:-:1 @P0 FADD part4C0, part4C0, part6C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part6C1; +--:-:-:-:1 
@P2 FADD part4C2, part4C2, part6C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part6C3; + +--:-:-:-:1 @P0 FADD c0, part0C0, part4C0; +--:-:-:-:1 @P1 FADD c1, part0C1, part4C1; +--:-:-:-:1 @P2 FADD c2, part0C2, part4C2; +--:-:-:-:1 @P3 FADD c3, part0C3, part4C3; + + +--:-:-:-:0 IADD cy, cy, 8; + +[+ + our $vec; + return $vec ? q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0.H0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC0.H1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC1.H0; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC1.H1; + } : q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC2; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC3; + }; ++] + +01:-:-:-:1 @P5 FFMA c0, b0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, b1, beta, c1; +04:-:-:-:1 @P5 FFMA c2, b2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, b3, beta, c3; + +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:0 ISETP.LT.AND P5, PT, cy, param_m, P5; + +--:-:1:-:1 @P0 F2F.F16.F32 c0, c0; +--:-:2:-:1 @P1 F2F.F16.F32 c1, c1; + +--:-:-:-:0 ISETP.LT.AND P4, PT, cy, param_m, PT; + +--:-:3:-:1 @P2 F2F.F16.F32 c2, c2; +--:-:4:-:1 @P3 F2F.F16.F32 c3, c3; + +[+ + our $vec; + return $vec ? 
q{ +03:-:-:-:2 @P0 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 @P0 BFI c1, c3, 0x1010, c2; + +--:1:-:-:1 @P0 STG.E.CG.64 [C], c; + } : q{ +01:-:-:-:1 @P0 STG.E.U16 [C + 2x<0>], c0; +02:-:-:-:1 @P1 STG.E.U16 [C + 2x<1>], c1; +04:-:-:-:1 @P2 STG.E.U16 [C + 2x<2>], c2; +08:1:-:-:1 @P3 STG.E.U16 [C + 2x<3>], c3; + }; ++] + +// Restore beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +01:-:-:-:6 IADD C0.CC, C0, ldc8; +--:-:-:-:0 IADD.X C1, C1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_nt_32x128.sass b/Kernel/SGEMM/Maxwell/hgemm_nt_32x128.sass new file mode 100644 index 0000000..eef6e5e --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nt_32x128.sass @@ -0,0 +1,588 @@ +# Kernel: hgemm_nt_32x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<(128*16 + 32)*2 + (32*16 + 32)*2> + szShareA : (32*16 + 32) + szShareB : (128*16 + 32) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ tidX, lda, ldb, ldaz, ldbz, ldb32, tid1, tid3, tid96, ta, tb00, tb32, tb64, tb96, xmad_ta, xmad_tb, shiftX, tidY<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadA<0-3> + 84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + + 100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 110-120 ~ writeAs, writeBs, k, tidY, txa, txb00, txb32, txb64, txb96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; +--:-:3:-:1 S2R blkA, SR_CTAID.Y; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; 
+--:-:-:-:1 SHL ldb32, ldb, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidX = tid >> 2 +// tidY = (tid & 3) << 2 +// shiftX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidY, tid3, 2; +--:-:-:-:1 SHL shiftX, tid3, 3; + +// trackA += ((blkA*32 + tidX) * lda + tidY) * 2 +04:-:-:-:1 ISCADD txa, blkA, tidX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 1; + +// trackB += ((blkB*128 + tidX) * ldb + tidY) * 2 +02:-:-:-:1 ISCADD txb00, blkB, tidX, 7; +--:-:-:-:1 IADD txb32, txb00, 32; +--:-:-:-:1 IADD txb64, txb00, 64; +--:-:-:-:1 IADD txb96, txb00, 96; + +--:-:-:-:1 XMAD.LO tb00, ldb, txb00, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb00, ldbz, blkZ, tb00; +--:-:-:-:1 IADD tb32, tb00, ldb32; +--:-:-:-:1 IADD tb64, tb32, ldb32; +--:-:-:-:1 IADD tb96, tb64, ldb32; + +--:-:-:-:1 LEA track0B0.CC, tb00, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb00, param_B[1], RZ, 1; +--:-:-:-:1 LEA track1B0.CC, tb32, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb32, param_B[1], RZ, 1; +--:-:-:-:1 LEA track2B0.CC, tb64, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb64, param_B[1], RZ, 1; +--:-:-:-:1 LEA track3B0.CC, tb96, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb96, param_B[1], RZ, 1; + +// writeAs = (tidY*32 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 7; +--:-:-:-:1 IADD writeBs, writeBs, shiftX; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, 
readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4 +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readBs, readBs, tid96; +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb32, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb64, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb96, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.CI.64 load0B, [track0B]; +--:-:2:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:3:-:1 @P4 LDG.E.CI.64 load2B, [track2B]; +--:-:4:-:1 @P5 LDG.E.CI.64 load3B, [track3B]; +--:-:5:-:1 @P6 LDG.E.CI.64 loadA, [trackA]; + + + +--:-:6:-:1 @!P2 LDS.U.64 load0B, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 load1B, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.64 load2B, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.64 load3B, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.64 loadA, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidY1, tidY, 1; +--:-:-:-:1 IADD tidY2, tidY, 2; +--:-:-:-:1 IADD tidY3, tidY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P4; + +--:-:1:-:1 @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:1:-:1 @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:1:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:1:-:1 @P3 LDG.E.CI.U16 load0B3, 
[track0B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb32, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb64, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb96, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + 
+--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P6; + +--:-:5:-:1 @P0 LDG.E.CI.U16 loadA0, [trackA + 2x<0>]; +--:-:5:-:1 @P1 LDG.E.CI.U16 loadA1, [trackA + 2x<1>]; +--:-:5:-:1 @P2 LDG.E.CI.U16 loadA2, [trackA + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 loadA3, [trackA + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb32, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P0, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P0; + + + + + our $vec; + return $vec ? 
q{ +21:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:1:-:1 F2F.F32.F16 load0B0, load0B0.H0; + +02:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + +04:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:3:-:1 F2F.F32.F16 load2B0, load2B0.H0; + +08:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:4:-:1 F2F.F32.F16 load3B0, load3B0.H0; + +10:-:-:-:1 F2F.F32.F16 loadA3, loadA1.H1; +--:-:-:-:1 F2F.F32.F16 loadA2, loadA1.H0; +--:-:-:-:1 F2F.F32.F16 loadA1, loadA0.H1; +--:-:5:-:1 F2F.F32.F16 loadA0, loadA0.H0; + } : q{ +21:-:-:-:1 F2F.F32.F16 load0B0, load0B0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:1:-:1 F2F.F32.F16 load0B3, load0B3; + +02:-:-:-:1 F2F.F32.F16 load1B0, load1B0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:2:-:1 F2F.F32.F16 load1B3, load1B3; + +04:-:-:-:1 F2F.F32.F16 load2B0, load2B0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:3:-:1 F2F.F32.F16 load2B3, load2B3; + +08:-:-:-:1 F2F.F32.F16 load3B0, load3B0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:4:-:1 F2F.F32.F16 load3B3, load3B3; + +10:-:-:-:1 F2F.F32.F16 loadA0, loadA0; +--:-:-:-:1 F2F.F32.F16 loadA1, loadA1; +--:-:-:-:1 F2F.F32.F16 loadA2, loadA2; +--:-:5:-:1 F2F.F32.F16 loadA3, loadA3; + }; + + +01:-:-:-:1 STS [writeBs + 4x<0*128 + 0*32>], load0B0; +--:-:-:-:0 IADD track0B0.CC, track0B0, 2x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 0*32>], load0B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 0*32>], load0B2; +--:-:-:-:4 STS [writeBs 
+ 4x<3*128 + 0*32>], load0B3; + +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +02:-:-:-:1 STS [writeBs + 4x<0*128 + 1*32>], load1B0; +--:-:-:-:0 IADD track1B0.CC, track1B0, 2x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 1*32>], load1B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 1*32>], load1B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 1*32>], load1B3; + +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +04:-:-:-:1 STS [writeBs + 4x<0*128 + 2*32>], load2B0; +--:-:-:-:0 IADD track2B0.CC, track2B0, 2x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 2*32>], load2B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 2*32>], load2B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 2*32>], load2B3; + +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +08:-:-:-:1 STS [writeBs + 4x<0*128 + 3*32>], load3B0; +--:-:-:-:0 IADD track3B0.CC, track3B0, 2x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 3*32>], load3B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 3*32>], load3B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 3*32>], load3B3; + +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +10:-:-:-:1 STS [writeAs + 4x<0*32>], loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*32>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*32>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*32>], loadA3; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P2 LDG.E.CI.64 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:5:-:1 @P4 LDG.E.CI.64 load2B, [track2B]; +--:-:5:-:1 @P5 LDG.E.CI.64 load3B, [track3B]; +--:-:6:-:1 @P6 LDG.E.CI.64 loadA, [trackA]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:5:-:1 @P4 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 1; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 0*32>], load0B0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 0*32>], load0B1;\n", + j3c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 0*32>], load0B2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 0*32>], load0B3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 1*32>], load1B0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 
1*32>], load1B1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 1*32>], load1B2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 1*32>], load1B3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 2*32>], load2B0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 2*32>], load2B1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 2*32>], load2B2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 2*32>], load2B3;\n", + + j9c6 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 3*32>], load3B0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 3*32>], load3B1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 3*32>], load3B2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 3*32>], load3B3;\n", + + j11c6 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<0*32>], loadA0;\n", + j11c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], loadA1;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], loadA2;\n", + j11c12 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<3*32>], loadA3;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0B0.CC, track0B0, 2x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0B1, track0B1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, 2x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2B0.CC, track2B0, 2x<16>;\n", + j7c13 => "--:-:-:-:1 \@P4 IADD.X track2B1, track2B1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3B0.CC, track3B0, 2x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3B1, track3B1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackA0.CC, trackA0, 2x<16>;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackA1, trackA1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 
=> "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.64 load0B, [track0B];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1B, [track1B];\n", + j9c29 => "10:-:5:-:1 \@P4 LDG.E.CI.64 load2B, [track2B];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.64 load3B, [track3B];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.64 loadA, [trackA];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1B3, load1B1.H1;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B2, load1B1.H0;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B1, load1B0.H1;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j6c13 => "10:-:-:-:1 \@P4 F2F.F32.F16 load2B3, load2B1.H1;\n", + j6c17 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2B2, load2B1.H0;\n", + j6c21 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2B1, load2B0.H1;\n", + j6c25 => "--:-:5:-:1 \@P4 F2F.F32.F16 load2B0, load2B0.H0;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B3, load3B1.H1;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B2, load3B1.H0;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B1, load3B0.H1;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadA3, loadA1.H1;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA2, loadA1.H0;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA1, loadA0.H1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA0, loadA0.H0;\n", + ) : + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.U16 
load0B0, [track0B + 2x<0>];\n", + j3c31 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j4c1 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j5c31 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j6c1 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + + j9c29 => "10:-:5:-:1 \@P4 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j9c31 => "--:-:5:-:1 \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j10c1 => "--:-:5:-:1 \@P4 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j10c3 => "--:-:5:-:1 \@P4 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + + j10c8 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j10c10 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j10c12 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];\n", + j11c31 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];\n", + j12c1 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0B0, load0B0;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B1, load0B1;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B2, load0B2;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0B3, load0B3;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1B0, load1B0;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B1, load1B1;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B2, load1B2;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1B3, load1B3;\n", + + j6c13 => "10:-:-:-:1 \@P4 F2F.F32.F16 load2B0, load2B0;\n", + j6c17 => 
"--:-:-:-:1 \@P4 F2F.F32.F16 load2B1, load2B1;\n", + j6c21 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2B2, load2B2;\n", + j6c25 => "--:-:5:-:1 \@P4 F2F.F32.F16 load2B3, load2B3;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B0, load3B0;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B1, load3B1;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B2, load3B2;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3B3, load3B3;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadA0, loadA0;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA1, loadA1;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA2, loadA2;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA3, loadA3;\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_nt_32x32.sass b/Kernel/SGEMM/Maxwell/hgemm_nt_32x32.sass new file mode 100644 index 0000000..1225d7d --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_nt_32x32.sass @@ -0,0 +1,1067 @@ +# Kernel: hgemm_nt_32x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+<CONSTANT_MAPPING>
+
+    addr_zero : 16x<32*65>
+    szShareA : (32*65)
+    szShareB : (32*65)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0] : c[0x0][0x140]
+    param_C[1] : c[0x0][0x144]
+    param_A[0] : c[0x0][0x148]
+    param_A[1] : c[0x0][0x14c]
+    param_B[0] : c[0x0][0x150]
+    param_B[1] : c[0x0][0x154]
+    param_alpha : c[0x0][0x158]
+    param_beta : c[0x0][0x15c]
+    param_flags : c[0x0][0x160]
+    param_lda : c[0x0][0x164]
+    param_ldb : c[0x0][0x168]
+    param_ldc : c[0x0][0x16c]
+    param_m : c[0x0][0x170]
+    param_n : c[0x0][0x174]
+    param_k : c[0x0][0x178]
+    param_ldaz : c[0x0][0x17c]
+    param_ldbz : c[0x0][0x180]
+    param_ldcz : c[0x0][0x184]
+    param_loops : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    3, 2,11,10,19,18,27,26 : cx<0-7>y0
+    7, 6,15,14,23,22,31,30 : cx<0-7>y1
+    1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+    5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    0-63 : czero<00-63>
+    64-79 : j0Ay<0-7>, j0Bx<0-7>
+    80-95 : j1Ay<0-7>, j1Bx<0-7>
+
+    64-95 ~ tidX, tidY, tidY<1-3>, lda, ldb, ldaz, ldbz, lda16, ldb16, tid1, tid16, tid16_8, ta<00|16>, txa<00|16>, tb<00|16>, txb<00|16>, xmad_ta, xmad_tb, shiftX, predsY0, predsY4, partialK
+
+    96-127 : load0A<0-7>, load1A<0-7>, load0B<0-7>, load1B<0-7>
+    128-135 : track0A<0-1>, track1A<0-1>, track0B<0-1>, track1B<0-1>
+
+    136-142 ~ swapBuf, readAs, readBs, writeAs, writeBs, k
+    143-149 ~ tid, blkA, blkB, blkZ, writeCs, preds
+
+    0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3>
+    64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+    64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+    96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-142 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc16, readCs, alpha, beta, flags, tid7, tid8
+</REGISTER_MAPPING>
+
+
+--:-:1:-:1 S2R tid, SR_TID.X;
+--:-:2:-:1 
S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL lda16, lda, 4; +--:-:-:-:1 SHL ldb16, ldb, 4; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// tidX = tid >> 3 +// tidY = (tid & 7) << 3 +// shiftX = (tid & 7) << 2 +01:-:-:-:1 SHR.U32 tidX, tid, 3; +--:-:-:-:1 LOP.AND tidY, tid, 7; +--:-:-:-:1 SHL shiftX, tidY, 2; +--:-:-:-:1 SHL tidY, tidY, 3; + +// trackA += ((blkA*32 + tidX) * lda + tidY) * 2 +02:-:-:-:1 ISCADD txa00, blkA, tidX, 5; +--:-:-:-:1 IADD txa16, txa00, 16; +--:-:-:-:1 XMAD.LO ta00, lda, txa00, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta00, ldaz, blkZ, ta00; +--:-:-:-:1 IADD ta16, ta00, lda16; +--:-:-:-:1 LEA track0A0.CC, ta00, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta00, param_A[1], RZ, 1; +--:-:-:-:1 LEA track1A0.CC, ta16, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track1A1, ta16, param_A[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa16, param_m, PT; + +// trackB += ((blkB*32 + tidX) * ldb + tidY) * 2 +04:-:-:-:1 ISCADD txb00, blkB, tidX, 5; +--:-:-:-:1 IADD txb16, txb00, 16; +--:-:-:-:1 XMAD.LO tb00, ldb, txb00, tidY, xmad_tb; +--:-:-:-:1 XMAD.LO2 tb00, ldbz, blkZ, tb00; +--:-:-:-:1 IADD tb16, tb00, ldb16; +--:-:-:-:1 LEA track0B0.CC, tb00, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb00, param_B[1], RZ, 1; +--:-:-:-:1 LEA track1B0.CC, tb16, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb16, param_B[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb16, param_n, PT; + +--:-:-:-:1 P2R preds, PR, RZ, 0x3c; + +// writeAs = (tidY*32 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftX; 
+--:-:-:-:1 SHL writeAs, writeAs, 2;
+
+// writeBs = (tidY*32 + tidX + shiftX) * 4 + 4x<szShareA>
+--:-:-:-:1 ISCADD writeBs, tidY, tidX, 5;
+--:-:-:-:1 IADD writeBs, writeBs, shiftX;
+--:-:-:-:1 ISCADD writeBs, writeBs, 4x<szShareA>, 2; // NOTE(review): operand restored from a bare "4x" - confirm against upstream
+
+
+// readAs = (((tid & 8) >> 2) | (tid & 1)) << 4
+--:-:-:-:1 LOP.AND tid1, tid, 1;
+--:-:-:-:1 LOP.AND readAs, tid, 8;
+--:-:-:-:1 SHR.U32 readAs, readAs, 2;
+--:-:-:-:1 LOP.OR readAs, readAs, tid1;
+--:-:-:-:1 SHL readAs, readAs, 4;
+
+// readBs = ((tid >> 1) & 3) << 4
+--:-:-:-:1 BFE.U32 readBs, tid, 0x201; // 2 bits at position 1
+--:-:-:-:1 SHL readBs, readBs, 4;
+
+// tid16 = tid & -16
+// tid16_8 = tid16 / 2 * 4
+--:-:-:-:1 LOP.AND tid16, tid, -16;
+--:-:-:-:1 SHL tid16_8, tid16, 1;
+
+// writeCs = (readAs + tid16*4) * 32 + readBs;
+--:-:-:-:1 ISCADD writeCs, tid16, readAs, 2;
+--:-:-:-:1 ISCADD writeCs, writeCs, readBs, 5;
+
+// Each block of 16 threads works on 8 lines, shifted over by 4
+// readAs += tid16_8 * 32 + tid16
+// readBs += tid16_8 * 32 + tid16 + 4x<szShareA>
+--:-:-:-:1 ISCADD readAs, tid16_8, readAs, 5;
+--:-:-:-:1 ISCADD readBs, tid16_8, readBs, 5;
+--:-:-:-:1 IADD readAs, tid16, readAs;
+--:-:-:-:1 IADD3 readBs, tid16, 4x<szShareA>, readBs; // NOTE(review): operand restored from a bare "4x" - confirm against upstream
+
+--:-:-:-:1 MOV32I swapBuf, 4x<szShareA + szShareB>; // NOTE(review): restored; addr_zero (16x<32*65>) is exactly twice this value - confirm against upstream
+
+// If k is not a multiple of 64 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 64 then make a full 64 line fetch.
+--:-:-:-:1 LOP.AND.Z P0, partialK, k, 63;
+--:-:-:-:1 @P0 MOV partialK, 64;
+--:-:-:-:1 IADD k, k, -partialK;
+[+
+    our $vec;
+    return $vec ? 
q{ + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x3c; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x3c; + + +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.128 load1A, [track1A]; +--:-:4:-:1 @P4 LDG.E.CI.128 load0B, [track0B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load1B, [track1B]; + + + +--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.128 load1A, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 load0B, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 load1B, [addr_zero]; + + + } : q{ +--:-:-:-:1 IADD tidY1, tidY, 1; +--:-:-:-:1 IADD tidY2, tidY, 2; +--:-:-:-:1 IADD tidY3, tidY, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, partialK, PT; +--:-:-:-:1 P2R predsY0, PR, RZ, 0x0f; + +--:-:-:-:1 IADD tidY, tidY, 4; +--:-:-:-:1 IADD tidY1, tidY1, 4; +--:-:-:-:1 IADD tidY2, tidY2, 4; +--:-:-:-:1 IADD tidY3, tidY3, 4; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, partialK, PT; +--:-:-:-:1 P2R predsY4, PR, RZ, 0x0f; + + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 @P4 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 @P4 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 
load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load0A4, RZ; +--:-:-:-:1 @!P1 MOV load0A5, RZ; +--:-:-:-:1 @!P2 MOV load0A6, RZ; +--:-:-:-:1 @!P3 MOV load0A7, RZ; + + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa16, param_m, PT; +--:-:-:-:1 @P5 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 @P5 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1A4, [track1A + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1A5, [track1A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1A6, [track1A + 2x<6>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load1A4, RZ; +--:-:-:-:1 @!P1 MOV load1A5, RZ; +--:-:-:-:1 @!P2 MOV load1A6, RZ; +--:-:-:-:1 @!P3 MOV load1A7, RZ; + + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb00, param_n, PT; +--:-:-:-:1 @P6 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 @P6 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0B4, [track0B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0B5, [track0B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0B6, [track0B + 2x<6>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV 
load0B4, RZ; +--:-:-:-:1 @!P1 MOV load0B5, RZ; +--:-:-:-:1 @!P2 MOV load0B6, RZ; +--:-:-:-:1 @!P3 MOV load0B7, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb16, param_n, PT; +--:-:-:-:1 @P4 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 @P4 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1B4, [track1B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1B5, [track1B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1B6, [track1B + 2x<6>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 load1B7, [track1B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load1B4, RZ; +--:-:-:-:1 @!P1 MOV load1B5, RZ; +--:-:-:-:1 @!P2 MOV load1B6, RZ; +--:-:-:-:1 @!P3 MOV load1B7, RZ; + }; ++] +--:-:-:-:1 SHL partialK, partialK, 1; + +--:-:-:-:1 ISETP.GE.AND P0, PT, k, 64, PT; +--:-:-:-:1 IADD k, k, -64; +--:-:-:-:1 @P0 R2P PR, preds, 0x3c; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x3c; + + +[+ + our $vec; + return $vec ? 
q{ +22:-:-:-:1 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A2.H1; +--:-:6:-:1 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0.H0; + } : q{ +02:-:-:-:1 F2F.F32.F16 load0A7, load0A7; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A6; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A5; +--:-:6:-:1 F2F.F32.F16 load0A4, load0A4; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A3; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0; + }; ++] +--:-:-:-:0 IADD track0A0.CC, track0A0, partialK; +20:-:-:-:1 STS [writeAs + 4x<7*32 + 0*16>], load0A7; +--:-:-:-:1 STS [writeAs + 4x<6*32 + 0*16>], load0A6; +--:-:-:-:1 STS [writeAs + 4x<5*32 + 0*16>], load0A5; +--:-:-:-:1 STS [writeAs + 4x<4*32 + 0*16>], load0A4; +02:-:-:-:1 STS [writeAs + 4x<3*32 + 0*16>], load0A3; +--:-:-:-:1 STS [writeAs + 4x<2*32 + 0*16>], load0A2; +--:-:-:-:1 STS [writeAs + 4x<1*32 + 0*16>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<0*32 + 0*16>], load0A0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +04:-:-:-:1 F2F.F32.F16 load1A7, load1A3.H1; +--:-:-:-:1 F2F.F32.F16 load1A6, load1A3.H0; +--:-:-:-:1 F2F.F32.F16 load1A5, load1A2.H1; +--:-:6:-:1 F2F.F32.F16 load1A4, load1A2.H0; +--:-:-:-:1 F2F.F32.F16 load1A3, load1A1.H1; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A1.H0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A0.H1; +--:-:2:-:1 F2F.F32.F16 load1A0, load1A0.H0; + } : q{ +04:-:-:-:1 F2F.F32.F16 load1A7, load1A7; +--:-:-:-:1 F2F.F32.F16 load1A6, load1A6; +--:-:-:-:1 F2F.F32.F16 load1A5, load1A5; +--:-:6:-:1 F2F.F32.F16 load1A4, load1A4; +--:-:-:-:1 F2F.F32.F16 load1A3, load1A3; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A2; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A1; +--:-:2:-:1 F2F.F32.F16 load1A0, load1A0; + }; ++] +--:-:-:-:0 IADD track1A0.CC, track1A0, partialK; +20:-:-:-:1 STS [writeAs + 4x<7*32 + 1*16>], load1A7; +--:-:-:-:1 STS [writeAs + 4x<6*32 + 1*16>], load1A6; +--:-:-:-:1 STS [writeAs + 4x<5*32 + 1*16>], load1A5; +--:-:-:-:1 STS [writeAs + 4x<4*32 + 1*16>], load1A4; +02:-:-:-:1 STS [writeAs + 4x<3*32 + 1*16>], load1A3; +--:-:-:-:1 STS [writeAs + 4x<2*32 + 1*16>], load1A2; +--:-:-:-:1 STS [writeAs + 4x<1*32 + 1*16>], load1A1; +--:-:-:-:1 STS [writeAs + 4x<0*32 + 1*16>], load1A0; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +08:-:-:-:1 F2F.F32.F16 load0B7, load0B3.H1; +--:-:-:-:1 F2F.F32.F16 load0B6, load0B3.H0; +--:-:-:-:1 F2F.F32.F16 load0B5, load0B2.H1; +--:-:6:-:1 F2F.F32.F16 load0B4, load0B2.H0; +--:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:2:-:1 F2F.F32.F16 load0B0, load0B0.H0; + } : q{ +08:-:-:-:1 F2F.F32.F16 load0B7, load0B7; +--:-:-:-:1 F2F.F32.F16 load0B6, load0B6; +--:-:-:-:1 F2F.F32.F16 load0B5, load0B5; +--:-:6:-:1 F2F.F32.F16 load0B4, load0B4; +--:-:-:-:1 F2F.F32.F16 load0B3, load0B3; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:2:-:1 F2F.F32.F16 load0B0, load0B0; + }; ++] +--:-:-:-:0 IADD track0B0.CC, track0B0, partialK; +20:-:-:-:1 STS [writeBs + 4x<7*32 + 0*16>], load0B7; +--:-:-:-:1 STS [writeBs + 4x<6*32 + 0*16>], load0B6; +--:-:-:-:1 STS [writeBs + 4x<5*32 + 0*16>], load0B5; +--:-:-:-:1 STS [writeBs + 4x<4*32 + 0*16>], load0B4; +02:-:-:-:1 STS [writeBs + 4x<3*32 + 0*16>], load0B3; +--:-:-:-:1 STS [writeBs + 4x<2*32 + 0*16>], load0B2; +--:-:-:-:1 STS [writeBs + 4x<1*32 + 0*16>], load0B1; +--:-:-:-:1 STS [writeBs + 4x<0*32 + 0*16>], load0B0; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +10:-:-:-:1 F2F.F32.F16 load1B7, load1B3.H1; +--:-:-:-:1 F2F.F32.F16 load1B6, load1B3.H0; +--:-:-:-:1 F2F.F32.F16 load1B5, load1B2.H1; +--:-:6:-:1 F2F.F32.F16 load1B4, load1B2.H0; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + } : q{ +10:-:-:-:1 F2F.F32.F16 load1B7, load1B7; +--:-:-:-:1 F2F.F32.F16 load1B6, load1B6; +--:-:-:-:1 F2F.F32.F16 load1B5, load1B5; +--:-:6:-:1 F2F.F32.F16 load1B4, load1B4; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B3; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0; + }; ++] +--:-:-:-:0 IADD track1B0.CC, track1B0, partialK; +20:-:-:-:1 STS [writeBs + 4x<7*32 + 1*16>], load1B7; +--:-:-:-:1 STS [writeBs + 4x<6*32 + 1*16>], load1B6; +--:-:-:-:1 STS [writeBs + 4x<5*32 + 1*16>], load1B5; +--:-:-:-:1 STS [writeBs + 4x<4*32 + 1*16>], load1B4; +02:-:-:-:1 STS [writeBs + 4x<3*32 + 1*16>], load1B3; +--:-:-:-:1 STS [writeBs + 4x<2*32 + 1*16>], load1B2; +--:-:-:-:1 STS [writeBs + 4x<1*32 + 1*16>], load1B1; +--:-:-:-:1 STS [writeBs + 4x<0*32 + 1*16>], load1B0; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*32 + 16>]; + +[+ + our $vec; + return $vec ? 
q{ +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.128 load1A, [track1A]; +--:-:4:-:1 @P4 LDG.E.CI.128 load0B, [track0B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load1B, [track1B]; + } : q{ +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A4, [track1A + 2x<4>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A5, [track1A + 2x<5>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A6, [track1A + 2x<6>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>]; + +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B4, [track0B + 2x<4>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B5, [track0B + 2x<5>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B6, [track0B + 2x<6>]; +--:-:4:-:1 @P4 LDG.E.CI.U16 load0B7, [track0B + 2x<7>]; + +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B4, [track1B + 2x<4>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B5, [track1B + 2x<5>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B6, [track1B + 2x<6>]; +--:-:5:-:1 @P5 
LDG.E.CI.U16 load1B7, [track1B + 2x<7>]; + }; ++] + +LOOP: + +[+ + our $vec; + our %insert = + ( + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, RZ, PT;\n", + j0c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, k, 64, PT;\n" . + "--:-:-:-:1 IADD k, k, -64;\n", + + j0c23 => "--:-:-:-:1 \@P1 R2P PR, preds, 0x3c;\n", + j0c24 => "--:-:-:-:1 \@!P1 R2P PR, RZ, 0x3c;\n", + + j3c32 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<64>;\n", + j3c37 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j4c32 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 2x<64>;\n", + j4c37 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j5c32 => "--:-:-:-:1 \@P4 IADD track0B0.CC, track0B0, 2x<64>;\n", + j5c37 => "--:-:-:-:1 \@P4 IADD.X track0B1, track0B1, RZ;\n", + j6c32 => "--:-:-:-:1 \@P5 IADD track1B0.CC, track1B0, 2x<64>;\n", + j6c37 => "--:-:-:-:1 \@P5 IADD.X track1B1, track1B1, RZ;\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j2c45 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n", + j2c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n", + j2c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n", + j2c57 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n", + j2c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n", + j3c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n", + j3c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n", + j3c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j3c45 => "04:-:-:-:1 \@P0 F2F.F32.F16 load1A7, load1A3.H1;\n", + j3c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A6, load1A3.H0;\n", + j3c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A5, load1A2.H1;\n", + j3c57 => "--:-:3:-:1 \@P0 F2F.F32.F16 load1A4, load1A2.H0;\n", + j3c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A3, load1A1.H1;\n", + j4c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A2, load1A1.H0;\n", + j4c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A1, load1A0.H1;\n", + j4c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load1A0, load1A0.H0;\n", + + j4c45 => "08:-:-:-:1 \@P0 F2F.F32.F16 load0B7, load0B3.H1;\n", + j4c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B6, load0B3.H0;\n", + j4c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B5, load0B2.H1;\n", + j4c57 => "--:-:4:-:1 \@P0 F2F.F32.F16 load0B4, load0B2.H0;\n", + j4c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n", + j5c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n", + j5c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n", + j5c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j5c45 => "10:-:-:-:1 \@P0 F2F.F32.F16 load1B7, load1B3.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B6, load1B3.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B5, load1B2.H1;\n", + j5c57 => "--:-:5:-:1 \@P0 F2F.F32.F16 load1B4, load1B2.H0;\n", + j5c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n", + j6c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n", + j6c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, 
load1B0.H1;\n", + j6c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j3c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<7*32 + 0*16>], load0A7;\n", + j3c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32 + 0*16>], load0A6;\n", + j3c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32 + 0*16>], load0A5;\n", + j3c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*32 + 0*16>], load0A4;\n", + j3c24 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<3*32 + 0*16>], load0A3;\n", + j3c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32 + 0*16>], load0A2;\n", + j3c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32 + 0*16>], load0A1;\n", + j3c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<0*32 + 0*16>], load0A0;\n", + + j4c16 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<7*32 + 1*16>], load1A7;\n", + j4c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32 + 1*16>], load1A6;\n", + j4c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32 + 1*16>], load1A5;\n", + j4c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*32 + 1*16>], load1A4;\n", + j4c24 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<3*32 + 1*16>], load1A3;\n", + j4c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32 + 1*16>], load1A2;\n", + j4c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32 + 1*16>], load1A1;\n", + j4c30 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<0*32 + 1*16>], load1A0;\n", + + j5c16 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<7*32 + 0*16>], load0B7;\n", + j5c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*32 + 0*16>], load0B6;\n", + j5c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*32 + 0*16>], load0B5;\n", + j5c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*32 + 0*16>], load0B4;\n", + j5c24 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<3*32 + 0*16>], load0B3;\n", + j5c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*32 + 0*16>], load0B2;\n", + j5c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*32 + 0*16>], load0B1;\n", + j5c30 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<0*32 + 0*16>], load0B0;\n", + + j6c16 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<7*32 + 1*16>], load1B7;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*32 + 1*16>], 
load1B6;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*32 + 1*16>], load1B5;\n", + j6c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*32 + 1*16>], load1B4;\n", + j6c24 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<3*32 + 1*16>], load1B3;\n", + j6c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*32 + 1*16>], load1B2;\n", + j6c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*32 + 1*16>], load1B1;\n", + j6c30 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<0*32 + 1*16>], load1B0;\n", + + j3c62 => "02:-:2:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j4c62 => "04:-:3:-:1 \@P3 LDG.E.CI.128 load1A, [track1A];\n", + j5c62 => "08:-:4:-:1 \@P4 LDG.E.CI.128 load0B, [track0B];\n", + j6c62 => "10:-:5:-:1 \@P5 LDG.E.CI.128 load1B, [track1B];\n", + ) : + ( + j2c45 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A0, load0A0;\n", + j2c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A1;\n", + j2c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A2;\n", + j2c57 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A3, load0A3;\n", + j2c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A4;\n", + j3c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A5;\n", + j3c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A6;\n", + j3c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load0A7, load0A7;\n", + + j3c45 => "04:-:-:-:1 \@P0 F2F.F32.F16 load1A0, load1A0;\n", + j3c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A1, load1A1;\n", + j3c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A2, load1A2;\n", + j3c57 => "--:-:3:-:1 \@P0 F2F.F32.F16 load1A3, load1A3;\n", + j3c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A4, load1A4;\n", + j4c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A5, load1A5;\n", + j4c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A6, load1A6;\n", + j4c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load1A7, load1A7;\n", + + j4c45 => "08:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0;\n", + j4c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B1;\n", + j4c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B2;\n", + j4c57 => "--:-:4:-:1 \@P0 F2F.F32.F16 load0B3, load0B3;\n", + j4c61 => "--:-:-:-:1 \@P0 
F2F.F32.F16 load0B4, load0B4;\n", + j5c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B5, load0B5;\n", + j5c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B6, load0B6;\n", + j5c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load0B7, load0B7;\n", + + j5c45 => "10:-:-:-:1 \@P0 F2F.F32.F16 load1B0, load1B0;\n", + j5c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B1;\n", + j5c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B2;\n", + j5c57 => "--:-:5:-:1 \@P0 F2F.F32.F16 load1B3, load1B3;\n", + j5c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B4, load1B4;\n", + j6c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B5, load1B5;\n", + j6c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B6, load1B6;\n", + j6c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load1B7, load1B7;\n", + + j3c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<0*32 + 0*16>], load0A0;\n", + j3c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32 + 0*16>], load0A1;\n", + j3c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32 + 0*16>], load0A2;\n", + j3c22 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<3*32 + 0*16>], load0A3;\n", + j3c24 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<4*32 + 0*16>], load0A4;\n", + j3c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32 + 0*16>], load0A5;\n", + j3c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32 + 0*16>], load0A6;\n", + j3c30 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<7*32 + 0*16>], load0A7;\n", + + j4c16 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*32 + 1*16>], load1A0;\n", + j4c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32 + 1*16>], load1A1;\n", + j4c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32 + 1*16>], load1A2;\n", + j4c22 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<3*32 + 1*16>], load1A3;\n", + j4c24 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<4*32 + 1*16>], load1A4;\n", + j4c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32 + 1*16>], load1A5;\n", + j4c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32 + 1*16>], load1A6;\n", + j4c30 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<7*32 + 1*16>], load1A7;\n", + + j5c16 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<0*32 + 0*16>], load0B0;\n", + j5c18 => "--:-:-:-:1 
\@P0 STS [writeBs + 4x<1*32 + 0*16>], load0B1;\n", + j5c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*32 + 0*16>], load0B2;\n", + j5c22 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<3*32 + 0*16>], load0B3;\n", + j5c24 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<4*32 + 0*16>], load0B4;\n", + j5c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*32 + 0*16>], load0B5;\n", + j5c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*32 + 0*16>], load0B6;\n", + j5c30 => "--:6:-:-:1 \@P0 STS [writeBs + 4x<7*32 + 0*16>], load0B7;\n", + + j6c16 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*32 + 1*16>], load1B0;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*32 + 1*16>], load1B1;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*32 + 1*16>], load1B2;\n", + j6c22 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<3*32 + 1*16>], load1B3;\n", + j6c24 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<4*32 + 1*16>], load1B4;\n", + j6c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*32 + 1*16>], load1B5;\n", + j6c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*32 + 1*16>], load1B6;\n", + j6c30 => "--:6:-:-:1 \@P0 STS [writeBs + 4x<7*32 + 1*16>], load1B7;\n", + + j3c48 => "02:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j3c50 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j3c52 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j3c54 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + j3c56 => "20:-:-:-:1 \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n", + j3c58 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n", + j3c60 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n", + j3c62 => "--:-:2:-:1 \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n", + + j4c48 => "04:-:-:-:1 \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n", + j4c50 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n", + j4c52 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n", + j4c54 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n", + j4c56 => "20:-:-:-:1 \@P3 
LDG.E.CI.U16 load1A4, [track1A + 2x<4>];\n", + j4c58 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A5, [track1A + 2x<5>];\n", + j4c60 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A6, [track1A + 2x<6>];\n", + j4c62 => "--:-:3:-:1 \@P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>];\n", + + j5c48 => "08:-:-:-:1 \@P4 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j5c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j5c52 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j5c54 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + j5c56 => "20:-:-:-:1 \@P4 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];\n", + j5c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];\n", + j5c60 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];\n", + j5c62 => "--:-:4:-:1 \@P4 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];\n", + + j6c48 => "10:-:-:-:1 \@P5 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j6c50 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j6c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j6c54 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + j6c56 => "20:-:-:-:1 \@P5 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];\n", + j6c58 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];\n", + j6c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];\n", + j6c62 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];\n", + ) + ), + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out = ''; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? 
'@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// readCs = ((tid & 7) * 4 + (tid / 8) * 32) * 4 +--:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHR.U32 tid8, tid, 3; +--:-:-:-:1 SHL tid7, tid7, 2; +--:-:-:-:1 ISCADD readCs, tid8, tid7, 5; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*32 + tid7; +--:-:-:-:1 ISCADD cx, blkB, tid7, 5; +--:-:-:-:1 IADD cx1, cx, 1; +--:-:-:-:1 IADD cx2, cx, 2; +--:-:-:-:1 IADD cx3, cx, 3; + +// cy = blkA*32 + tid8 +--:-:-:-:1 ISCADD cy, blkA, tid8, 5; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 SHL ldc16, ldc, 5; + +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C1, ci, param_C[1], RZ, 1; + +// P0 = cx < n +--:-:-:-:1 ISETP.LT.AND P0, PT, cx, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, cx1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, cx2, param_n, PT; +--:-:-:-:1 
ISETP.LT.AND P3, PT, cx3, param_n, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +// P4 = cy < m +--:-:-:-:1 ISETP.LT.AND P4, PT, cy, param_m, PT; + +// P5 = beta != 0 && P4 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P4; + +// P6 = Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; + +// Init beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*32 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*32 + 16>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*32 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*32 + 16>], shuffle_x4y2; 
+--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*32 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*32 + 16>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, cx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32 + 16>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*32 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*32 + 16>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*32 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*32 + 16>], shuffle_x4y6; 
+--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*32 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*32 + 16>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:5 EXIT; + +STORE_C: + +[+ + our $vec; + return $vec ? q{ +--:-:1:-:1 @P0 LDG.E.64 loadC, [C]; + } : q{ +--:-:-:-:0 @!P0 MOV loadC0, RZ; +--:-:-:-:1 @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>]; +--:-:-:-:0 @!P1 MOV loadC1, RZ; +--:-:-:-:1 @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>]; +--:-:-:-:0 @!P2 MOV loadC2, RZ; +--:-:-:-:1 @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>]; +--:-:-:-:0 @!P3 MOV loadC3, RZ; +--:-:1:-:1 @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>]; + }; ++] + +// Restore output preds +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 LDS.U.128 part0C, [readCs + 4x<0*16*32>]; +--:-:2:-:1 LDS.U.128 part1C, [readCs + 4x<1*16*32>]; +--:-:-:-:1 LDS.U.128 part2C, [readCs + 4x<2*16*32>]; +--:-:3:-:1 LDS.U.128 part3C, [readCs + 4x<3*16*32>]; +--:-:-:-:1 LDS.U.128 part4C, [readCs + 4x<4*16*32>]; +--:-:4:-:1 LDS.U.128 part5C, [readCs + 4x<5*16*32>]; +--:-:-:-:1 LDS.U.128 part6C, [readCs + 4x<6*16*32>]; +--:-:5:-:1 LDS.U.128 part7C, [readCs + 4x<7*16*32>]; + + +02:-:-:-:1 @P0 FADD part0C0, part0C0, part1C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part1C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part1C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part1C3; + +04:-:-:-:1 @P0 FADD part2C0, part2C0, part3C0; +--:-:-:-:1 @P1 FADD part2C1, part2C1, part3C1; +--:-:-:-:1 @P2 FADD part2C2, part2C2, part3C2; +--:-:-:-:1 @P3 FADD part2C3, part2C3, part3C3; + +08:-:-:-:1 @P0 FADD part4C0, part4C0, part5C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part5C1; +--:-:-:-:1 @P2 FADD part4C2, part4C2, part5C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part5C3; + +10:-:-:-:1 @P0 FADD part6C0, part6C0, part7C0; +--:-:-:-:1 @P1 FADD 
part6C1, part6C1, part7C1; +--:-:-:-:1 @P2 FADD part6C2, part6C2, part7C2; +--:-:-:-:1 @P3 FADD part6C3, part6C3, part7C3; + +--:-:-:-:1 @P0 FADD part0C0, part0C0, part2C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part2C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part2C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part2C3; + +--:-:-:-:1 @P0 FADD part4C0, part4C0, part6C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part6C1; +--:-:-:-:1 @P2 FADD part4C2, part4C2, part6C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part6C3; + +--:-:-:-:1 @P0 FADD c0, part0C0, part4C0; +--:-:-:-:1 @P1 FADD c1, part0C1, part4C1; +--:-:-:-:1 @P2 FADD c2, part0C2, part4C2; +--:-:-:-:1 @P3 FADD c3, part0C3, part4C3; + + +--:-:-:-:0 IADD cy, cy, 16; + +[+ + our $vec; + return $vec ? q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0.H0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC0.H1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC1.H0; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC1.H1; + } : q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC2; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC3; + }; ++] + +01:-:-:-:1 @P5 FFMA c0, b0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, b1, beta, c1; +04:-:-:-:1 @P5 FFMA c2, b2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, b3, beta, c3; + +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:0 ISETP.LT.AND P5, PT, cy, param_m, P5; + +--:-:1:-:1 @P0 F2F.F16.F32 c0, c0; +--:-:2:-:1 @P1 F2F.F16.F32 c1, c1; + +--:-:-:-:0 ISETP.LT.AND P4, PT, cy, param_m, PT; + +--:-:3:-:1 @P2 F2F.F16.F32 c2, c2; +--:-:4:-:1 @P3 F2F.F16.F32 c3, c3; + +[+ + our $vec; + return $vec ? 
q{ +03:-:-:-:2 @P0 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 @P0 BFI c1, c3, 0x1010, c2; + +--:1:-:-:1 @P0 STG.E.CG.64 [C], c; + } : q{ +01:-:-:-:1 @P0 STG.E.U16 [C + 2x<0>], c0; +02:-:-:-:1 @P1 STG.E.U16 [C + 2x<1>], c1; +04:-:-:-:1 @P2 STG.E.U16 [C + 2x<2>], c2; +08:1:-:-:1 @P3 STG.E.U16 [C + 2x<3>], c3; + }; ++] + +// Restore beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +01:-:-:-:6 IADD C0.CC, C0, ldc16; +--:-:-:-:0 IADD.X C1, C1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_tn_128x128.sass b/Kernel/SGEMM/Maxwell/hgemm_tn_128x128.sass new file mode 100644 index 0000000..c2beee1 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_tn_128x128.sass @@ -0,0 +1,360 @@ +# Kernel: hgemm_tn_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +[- + +our $int16; + +our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + +sub convert_in {return $convert;} + + +sub int16_params { + return $int16 ? 
q{ +param_Stats[0] : c[0x0][0x190] +param_Stats[1] : c[0x0][0x194] +param_scale : c[0x0][0x198] + } : ""; +} +-] + + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + [+ int16_params() +] + + + + + 64-95 ~ lda, ldb, ldaz, ldbz, tid1, tid7, tid31, tid128, tid15, tidX, blk, x<1-3>, y<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-103 : loadA<0-3>, loadB<0-3> + + 104-107 : trackA<0-1>, trackB<0-1> + + 108-118 ~ writeS, k, txa, txb, tidY, ta, tb, loop + 119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ + + 64-75 ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128 + + 64-79 : c<0-7>, d3, d2, d1, d0, cs<0-3> + 64-65 : Stats<0-1> + 80-89 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 90-118 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV loop, RZ; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ + join('', map 
sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); ++] + +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND tid128, tid, 128; + +// tidX = (tid & 31) << 2 +// tidY = (tid >> 5) & 7 +01:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL tidX, tid31, 2; +--:-:-:-:1 BFE.U32 tidY, tid, 0x305; // 3 bits at position 5 + +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 4; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// trackA += (blkA*128 + lda*tidY + tidX) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// trackB += (blkB*128 + ldb*tidY + tidX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidX, 7; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = (128*tidY + tidX) * 4 + 4x<128*8*2> +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeS, writeS, 4x<128*8*2>, 2; + + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + + + +[+ + our $vec; + return $vec ?
q{ + +// doLoad = tidY < k && txa|txb < m|n +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P6; + +--:-:2:-:1 @P2 LDG.E.CI.64 loadA, [trackA]; +--:-:3:-:1 @P3 LDG.E.CI.64 loadB, [trackB]; + +--:-:5:-:1 @!P2 LDS.U.64 loadA, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 loadB, [addr_zero]; + + // Vec 4 and scalar loads + } : q{ + +// doLoadA = tidY < k && txa < m +// doLoadB = tidY < k && txb < n +--:-:-:-:1 IADD x1, txa, 1; +--:-:-:-:1 IADD x2, txa, 2; +--:-:-:-:1 IADD x3, txa, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_m, P0; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<00 + 0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<00 + 1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<00 + 2>]; +--:-:2:-:1 @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 IADD y1, txb, 1; +--:-:-:-:1 IADD y2, txb, 2; +--:-:-:-:1 IADD y3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_n, P0; + +--:-:3:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>]; +--:-:3:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>]; +--:-:3:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>]; +--:-:3:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + }; ++] + + + +[+ + our $vec; + our $convert; + return $vec ?
qq{ +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 7; + +12:-:-:-:4 $convert loadA3, loadA1.H1; +--:-:-:-:0 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:4 $convert loadA2, loadA1.H0; +--:-:-:-:4 $convert loadA1, loadA0.H1; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; +--:-:2:-:2 $convert loadA0, loadA0.H0; + +02:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadA; + +24:-:-:-:4 $convert loadB3, loadB1.H1; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 $convert loadB2, loadB1.H0; +--:-:-:-:4 $convert loadB1, loadB0.H1; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; +--:-:3:-:2 $convert loadB0, loadB0.H0; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; + +04:-:-:-:1 STS.128 [writeS + 4x<8*128>], loadB; + + // scalar loads + } : qq{ +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +02:-:-:-:4 $convert loadA0, loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:4 $convert loadA1, loadA1; +--:-:-:-:4 $convert loadA2, loadA2; +--:-:2:-:2 $convert loadA3, loadA3; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadA0; + +04:-:-:-:4 $convert loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 $convert loadB1, loadB1; +--:-:-:-:4 $convert loadB2, loadB2; +--:-:3:-:2 $convert loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +04:-:-:-:1 STS.128 [writeS + 4x<8*128>], loadB0; + + }; ++] + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<128*8*2>; + + +[+ + our $vec; + our $convert; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n"); + + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + ($vec ? 
+ ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI.64 loadA0, [trackA];\n", + j0c13 => "--:-:3:-:1 \@P3 LDG.E.CI.64 loadB0, [trackB];\n", + + j5c1 => "02:-:-:-:1 \@P2 $convert loadA3, loadA1.H1;\n", + j5c5 => "--:-:-:-:1 \@P2 $convert loadA2, loadA1;\n", + j5c9 => "--:-:-:-:1 \@P2 $convert loadA1, loadA0.H1;\n", + j5c13 => "--:-:2:-:1 \@P2 $convert loadA0, loadA0;\n", + + j6c1 => "04:-:-:-:1 \@P3 $convert loadB3, loadB1.H1;\n", + j6c5 => "--:-:-:-:1 \@P3 $convert loadB2, loadB1;\n", + j6c9 => "--:-:-:-:1 \@P3 $convert loadB1, loadB0.H1;\n", + j6c13 => "--:-:3:-:1 \@P3 $convert loadB0, loadB0;\n", + ) : + ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n", + j0c12 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n", + j0c14 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n", + j0c16 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n", + + j0c29 => "--:-:3:-:1 \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j0c31 => "--:-:3:-:1 \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j0c33 => "--:-:3:-:1 \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j0c35 => "--:-:3:-:1 \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j5c1 => "02:-:-:-:1 \@P2 $convert loadA0, loadA0;\n", + j5c5 => "--:-:-:-:1 \@P2 $convert loadA1, loadA1;\n", + j5c9 => "--:-:-:-:1 \@P2 $convert loadA2, loadA2;\n", + j5c13 => "--:-:2:-:1 \@P2 $convert loadA3, loadA3;\n", + + j6c1 => "04:-:-:-:1 \@P3 $convert loadB0, loadB0;\n", + j6c5 => "--:-:-:-:1 \@P3 $convert loadB1, loadB1;\n", + j6c9 => "--:-:-:-:1 \@P3 $convert loadB2, loadB2;\n", + j6c13 => "--:-:3:-:1 \@P3 $convert loadB3, loadB3;\n", + ) + ), + + j5c31 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*128>], loadA;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, param_lda8;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c31 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<8*128>], loadB;\n", + + j6c46 => "--:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c54 => 
"--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" . + "--:-:-:-:1 IADD32I k, k, -8;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; ++] + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_tn_128x16.sass b/Kernel/SGEMM/Maxwell/hgemm_tn_128x16.sass new file mode 100644 index 0000000..5cd8cce --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_tn_128x16.sass @@ -0,0 +1,554 @@ +# Kernel: hgemm_tn_128x16 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*2 + 16*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 16-17 : Rand<0-1> + + 18-47 ~ lda, ldb, ldaz, ldbz, lda8, ldb8, ta, tb, tid1, tid96, tidAX, tidBX, tidY, txa, txb, dimA, flag + + 0-15 : czero<00-15> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + + 16-23 : j0Ay<0-3>, j0Bx<0-3> + 24-31 : j1Ay<0-3>, j1Bx<0-3> + 32-39 : j2Ay<0-3>, j2Bx<0-3> + 40-47 : j3Ay<0-3>, j3Bx<0-3> + + 48-55 : load0A<0-7> + 56-63 : load1A<0-7> + 64-71 : load2A<0-7> + 72-79 : load3A<0-7> + + 80-83 : load<0-3>B + + 84-87 : track0A<0-1>, track0B<0-1> + 88-91 : track1A<0-1>, track1B<0-1> + 92-95 : track2A<0-1>, track2B<0-1> + 96-99 : track3A<0-1>, track3B<0-1> + + 100-104 ~ writeAs, writeBs, k, lda32, ldb32 + 105-112 ~ readAs, readBs, tid, blkA, blkB, blkZ, tbid, seed + + 16-25 : c<0-3>, b<0-1>, d3, d2, d1, d0 + 26-27 : Cy<0-1> + 28-104 ~ ldc, ldcz, ldc1, writeCs, readCs, tidCX, tidCY, cx, cy, ci, xmad_c, alpha, beta, flags, tid31, lfsr<0-2>, exp<0-3>, rand<0-3>, lfsr<0-2>_1, lfsr<0-2>_2, clk_shf1, clk_shf2 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 STS.128 [addr_zero], RZ; + +--:-:-:-:1 LDS.U.128 czero00, [addr_zero]; +--:-:-:-:1 LDS.U.128 czero04, [addr_zero]; +--:-:-:-:1 LDS.U.128 czero08, [addr_zero]; 
+--:-:-:-:1 LDS.U.128 czero12, [addr_zero]; + +// Grab a seed for this thread +// (blkB*gridDimA*256 + blkA*256 + tid) & (2048*32 - 1) +--:-:-:-:1 MOV flag, param_flags; +--:-:-:-:1 LOP.AND.NZ P4, RZ, flag, 0x1; +--:-:-:-:1 MOV dimA, gridDimA; +03:-:-:-:1 ISCADD tbid, blkA, tid, 8; +04:-:-:-:1 XMAD.U16.U16 dimA, blkB, dimA, RZ; +--:-:-:-:1 ISCADD tbid, dimA, tbid, 8; +--:-:-:-:1 LOP.AND seed, tbid, 1x<2048*32 - 1>; +--:-:-:-:1 LEA Rand0.CC, seed, param_Rand[0], 0x2; +--:-:-:-:1 LEA.HI.X Rand1, seed, param_Rand[1], RZ, 0x2; +--:-:-:-:1 @P4 LDG.E.CS seed, [Rand]; + +// tidBX = tid & 15 +// tidAX = (tid & 15) << 3 +// tidY = (tid >> 4) & 7 +01:-:-:-:1 LOP.AND tidBX, tid, 15; +--:-:-:-:1 SHL tidAX, tidBX, 3; +--:-:-:-:1 BFE.U32 tidY, tid, 0x304; // 3 bits at position 4 + +--:-:-:-:1 MOV lda8, param_lda8; +--:-:-:-:1 MOV ldb8, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda8, 4; +--:-:-:-:1 SHR.U32 ldb, ldb8, 4; +--:-:-:-:1 SHL lda32, lda8, 2; +--:-:-:-:1 SHL ldb32, ldb8, 2; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + + +// trackA += (blkA*128 + lda*tidY + tidAX) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// trackB += (blkB*16 + ldb*tidY + tidBX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 4; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA track0B0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X track0B1, tb, param_B[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeAs = (128*tidY + tidAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidAX, 7; +--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (16*tidY + tidBX) * 4 + 4x<128*8> +--:-:-:-:1 ISCADD writeBs, tidY, tidBX, 4; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<128*8>, 2; + +// Start the read buffers low +//
readAs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4 +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readAs, readAs, tid96; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((((tid & 0x10) >> 3) | (tid & 1)) << 4) + 4x<128*8>; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readBs, tid, 0x10; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + +--:-:-:-:1 IADD track1A0.CC, track0A0, lda8; +--:-:-:-:1 IADD.X track1A1, track0A1, RZ; +--:-:-:-:1 IADD track1B0.CC, track0B0, ldb8; +--:-:-:-:1 IADD.X track1B1, track0B1, RZ; + +--:-:-:-:1 IADD track2A0.CC, track1A0, lda8; +--:-:-:-:1 IADD.X track2A1, track1A1, RZ; +--:-:-:-:1 IADD track2B0.CC, track1B0, ldb8; +--:-:-:-:1 IADD.X track2B1, track1B1, RZ; + +--:-:-:-:1 IADD track3A0.CC, track2A0, lda8; +--:-:-:-:1 IADD.X track3A1, track2A1, RZ; +--:-:-:-:1 IADD track3B0.CC, track2B0, ldb8; +--:-:-:-:1 IADD.X track3B1, track2B1, RZ; + + +--:-:3:-:1 @P5 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P6 LDG.E.CI.S16 load0B, [track0B]; + +--:-:4:-:1 @P5 LDG.E.CI.128 load1A, [track1A]; +--:-:4:-:1 @P6 LDG.E.CI.S16 load1B, [track1B]; + +--:-:5:-:1 @P5 LDG.E.CI.128 load2A, [track2A]; +--:-:5:-:1 @P6 LDG.E.CI.S16 load2B, [track2B]; + +--:-:6:-:1 @P5 LDG.E.CI.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.S16 load3B, [track3B]; + + +--:-:-:-:1 ISETP.GE.AND P0, PT, k, 32, PT; +--:-:-:-:1 ISETP.GT.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GT.AND P4, PT, k, 32, P6; +--:-:-:-:1 IADD k, k, -32; + + +04:-:-:-:4 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:4 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:0 IADD track0A0.CC, track0A0, lda32; +--:-:-:-:4 F2F.F32.F16 load0A5, load0A2.H1; +--:-:1:-:4 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; +--:-:-:-:4 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:4 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:0
IADD track0B0.CC, track0B0, ldb32; +--:-:-:-:4 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:4 F2F.F32.F16 load0A0, load0A0.H0; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; +--:-:3:-:1 F2F.F32.F16 load0B, load0B; + +01:-:-:-:1 STS.128 [writeAs + 4x<0*(128*8 + 16*8) + 4>], load0A4; +02:-:-:-:1 STS.128 [writeAs + 4x<0*(128*8 + 16*8) + 0>], load0A0; +04:-:-:-:1 STS [writeBs + 4x<0*(128*8 + 16*8) + 0>], load0B; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 0*(128*8 + 16*8)>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*16 + 0*(128*8 + 16*8)>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 0*(128*8 + 16*8)>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*16 + 0*(128*8 + 16*8)>]; +--:-:3:-:1 @P3 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P4 LDG.E.CI.S16 load0B, [track0B]; + +LOOP: + + + + our @top; + our %insert; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + + foreach my $k (0 .. 
3) + { + my $shareBuf = ($k + 1) & 1; + my $store = ($k + 1) & 3; + my $loadBar = $store + 3; + my $storBar = sprintf '%02x', 1 << ($store + 2); + + %insert = + ( + j0c11 => "$storBar:-:-:-:1 \@P0 F2F.F32.F16 load${store}A7, load${store}A3.H1;\n", + j0c15 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A6, load${store}A3.H0;\n", + j1c3 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A5, load${store}A2.H1;\n", + j1c7 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A4, load${store}A2.H0;\n", + j1c11 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A3, load${store}A1.H1;\n", + j1c15 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A2, load${store}A1.H0;\n", + j2c3 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A1, load${store}A0.H1;\n", + j2c7 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A0, load${store}A0.H0;\n", + j2c11 => "--:-:$loadBar:-:1 \@P0 F2F.F32.F16 load${store}B, load${store}B;\n", + + j2c12 => "--:-:-:-:1 \@P0 IADD track${store}A0.CC, track${store}A0, lda32;\n", + j3c1 => "--:-:-:-:1 \@P0 IADD.X track${store}A1, track${store}A1, RZ;\n", + j3c3 => "--:-:-:-:1 \@P0 IADD track${store}B0.CC, track${store}B0, ldb32;\n", + j3c8 => "--:-:-:-:1 \@P0 IADD.X track${store}B1, track${store}B1, RZ;\n", + + j3c9 => "$storBar:-:-:-:1 \@P0 STS.128 [writeAs + 4x<$shareBuf*(128*8 + 16*8) + 0>], load${store}A0;\n", + j4c4 => "--:-:-:-:1 \@P0 STS.128 [writeAs + 4x<$shareBuf*(128*8 + 16*8) + 4>], load${store}A4;\n", + j4c6 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<$shareBuf*(128*8 + 16*8) + 0>], load${store}B;\n", + + j5c15 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n", + + j6c1 => "--:-:$loadBar:-:1 \@P3 LDG.E.CI.128 load${store}A, [track${store}A];\n", + j6c3 => "--:-:$loadBar:-:1 \@P4 LDG.E.CI.S16 load${store}B, [track${store}B];\n", + + ($k == 3 ? 
+ ( + j0c4 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 32, PT;\n", + j0c6 => "--:-:-:-:1 ISETP.GT.AND P3, PT, k, 32, P5;\n", + j0c8 => "--:-:-:-:1 ISETP.GT.AND P4, PT, k, 32, P6;\n", + j0c10 => "--:-:-:-:1 IADD k, k, -32;\n", + + j7c15 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : () + ), + ); + + foreach my $j (0 .. 7) + { + my $rsPred = $j >= 6 && $k == 3 ? '@P0' : ' '; + my $barrier = $j & 1 ? 2 : 1; + my $loadReg = ($j + 2) & 3; + my $compute = $j & 3; + my $shareLine = ($j + 2) & 7; + $shareBuf = $j >= 6 ? ($k + 1) & 1 : $k & 1; + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + %d*(128*8 + 16*8)>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shareBuf; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*16 + %d*(128*8 + 16*8)>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shareBuf; + + foreach my $c (0 .. 15) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 8 && $stall ? 
'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + $out .= "\n"; + } + return $out; + + + +// + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 16 + readBs; +--:-:-:-:1 LOP.AND readAs, readAs, 0x1ff; +--:-:-:-:1 LOP.AND readBs, readBs, 0x1ff; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 2; + +// tidCX = (tid & 3) << 2 +// tidCY = tid >> 2 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tidCX, tid, 3; +--:-:-:-:1 SHL tidCX, tidCX, 2; +--:-:-:-:1 SHR.U32 tidCY, tid, 2; + +// readCs = (tidCY*16 + tidCX) << 2; +--:-:-:-:1 ISCADD readCs, tidCY, tidCX, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*16 + tidCX; +--:-:-:-:1 ISCADD cx, blkB, tidCX, 4; + +// cy = blkA*128 + tidCY*4 +--:-:-:-:1 SHL cy, tidCY, 2; +--:-:-:-:1 ISCADD cy, blkA, cy, 7; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA Cy0.CC, ci, param_C[0], 1; +--:-:-:-:0 LEA.HI.X Cy1, ci, param_C[1], RZ, 1; + +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; + +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + +// Random Round flag +--:-:-:-:2 LOP.AND.NZ P4, RZ, flags, 1; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P3, RZ, flags, 2; + +--:-:-:-:1 SHL ldc1, ldc, 1; + +// Seed the Tausworthe +--:-:-:-:1 LOP.XOR lfsr0, seed, tbid; +--:-:-:-:1 CS2R lfsr1, SR_CLOCKLO; +--:-:-:-:1 CS2R lfsr2, SR_GLOBALTIMERLO; +--:-:-:-:1 LOP.AND clk_shf1, lfsr1, 31; +--:-:-:-:1 LOP.AND clk_shf2, lfsr2, 31; +--:-:-:-:1 LOP.XOR clk_shf1, clk_shf1, tid31; +--:-:-:-:1 LOP.XOR clk_shf2, clk_shf2, tid31; +--:-:-:-:1 SHF.R.U64 lfsr1, lfsr1, clk_shf1, lfsr1; +--:-:-:-:1 SHF.R.U64 lfsr2, lfsr2, clk_shf2, lfsr2; +--:-:-:-:1 LOP.AND tbid, 
tbid, 1x<2048*32 - 1>; + + + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..3) + { + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:6 LEA Rand0.CC, tbid, param_Rand[0], 0x2; +--:-:-:-:1 LEA.HI.X Rand1, tbid, param_Rand[1], RZ, 0x2; +--:-:-:-:2 LOP3.LUT seed, lfsr0, lfsr1, lfsr2, 0x96; +--:-:-:-:1 @P4 STG.E.CS [Rand], seed; + +--:-:-:-:5 EXIT; + + +STORE_C: + +--:-:-:-:2 ISETP.LT.AND P1, PT, cy, param_m, P5; +--:-:-:Y:b ISETP.LT.AND P0, PT, cy, param_m, P6; +--:-:-:-:0 IADD cy, cy, 1; + +--:-:1:-:1 @P1 LDG.E.64 b0, [Cy]; + +// Apply relu +--:-:-:-:1 @P3 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P3 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P3 FMNMX c2, c2, RZ, !PT; +--:-:-:-:4 @P3 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:5:-:1 LDS.U.128 c0, [readCs]; + +01:-:1:-:4 @P1 F2F.F32.F16 d3, b1.H1; +--:-:2:-:4 @P1 F2F.F32.F16 d2, b1.H0; +--:-:3:-:4 @P1 F2F.F32.F16 d1, b0.H1; +--:-:4:-:1 @P1 F2F.F32.F16 d0, b0.H0; + +11:-:-:-:1 @P1 FFMA c3, d3, beta, c3; +02:-:-:-:1 @P1 FFMA c2, d2, beta, c2; +04:-:-:-:1 @P1 FFMA c1, d1, beta, c1; +08:-:-:-:0 @P1 FFMA c0, d0, beta, c0; + +--:-:-:-:5 @P4 BRA.U DO_RANDOM1; + +--:-:1:-:4 F2F.F16.F32 c0, c0; +--:-:2:-:4 F2F.F16.F32 c1, c1; +--:-:3:-:4 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +--:-:-:-:5 BRA.U END_ROUND1; + +DO_RANDOM1: + +--:-:-:-:5 CAL RANDOM_ROUND; + +END_ROUND1: + +// Pack 2 16 bit values into 32 bit words +03:-:-:-:2 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 BFI c1, c3, 0x1010, c2; + +--:1:-:-:2 @P0 STG.E.64 [Cy], c0; + +01:-:-:-:6 IADD Cy0.CC, Cy0, ldc1; +--:-:-:-:0 IADD.X Cy1, Cy1, RZ; + +--:-:-:-:5 RET; + +RANDOM_ROUND: + + + +// Strip mantissa and leave sign+exponent +--:-:-:-:1 LOP32I.AND exp0, c0, 0xff800000; +--:-:-:-:1 LOP32I.AND exp1, c1, 0xff800000; 
+--:-:-:-:1 LOP32I.AND exp2, c2, 0xff800000; +--:-:-:-:1 LOP32I.AND exp3, c3, 0xff800000; + +// Find the exponent that will shift 32 bits of integer data +// out past the lsb of this number as an fp16 +// exp *= 2^-10 * 2^-32 (2^-42) +--:-:-:-:1 FMUL32I exp0, exp0, 0x2a800000; +--:-:-:-:1 FMUL32I exp1, exp1, 0x2a800000; +--:-:-:-:1 FMUL32I exp2, exp2, 0x2a800000; +--:-:-:-:1 FMUL32I exp3, exp3, 0x2a800000; + +// lfsr0 = ((lfsr0 & 0xfffffffe) << 12) ^ (((lfsr0 << 13) ^ lfsr0) >> 19); +--:-:-:-:1 LOP32I.AND lfsr0_1, lfsr0, 0xfffffffe; +--:-:-:-:1 SHL lfsr0_1, lfsr0_1, 12; +--:-:-:-:1 SHL lfsr0_2, lfsr0, 13; +--:-:-:-:1 LOP.XOR lfsr0_2, lfsr0_2, lfsr0; +--:-:-:-:1 SHR.U32 lfsr0_2, lfsr0_2, 19; +--:-:-:-:1 LOP.XOR lfsr0, lfsr0_1, lfsr0_2; + +// lfsr1 = ((lfsr1 & 0xfffffff8) << 4) ^ (((lfsr1 << 2) ^ lfsr1) >> 25); +--:-:-:-:1 LOP32I.AND lfsr1_1, lfsr1, 0xfffffff8; +--:-:-:-:1 SHL lfsr1_1, lfsr1_1, 4; +--:-:-:-:1 SHL lfsr1_2, lfsr1, 2; +--:-:-:-:1 LOP.XOR lfsr1_2, lfsr1_2, lfsr1; +--:-:-:-:1 SHR.U32 lfsr1_2, lfsr1_2, 25; +--:-:-:-:1 LOP.XOR lfsr1, lfsr1_1, lfsr1_2; + +// lfsr2 = ((lfsr2 & 0xfffffff0) << 11) ^ (((lfsr2 << 3) ^ lfsr2) >> 11); +--:-:-:-:1 LOP32I.AND lfsr2_1, lfsr2, 0xfffffff0; +--:-:-:-:1 SHL lfsr2_1, lfsr2_1, 11; +--:-:-:-:1 SHL lfsr2_2, lfsr2, 3; +--:-:-:-:1 LOP.XOR lfsr2_2, lfsr2_2, lfsr2; +--:-:-:-:1 SHR.U32 lfsr2_2, lfsr2_2, 11; +--:-:-:-:1 LOP.XOR lfsr2, lfsr2_1, lfsr2_2; + +// rand = lfsr0 ^ lfsr1 ^ lfsr2; +// generate 3 other rotations of this rand +--:-:-:-:1 LOP3.LUT rand0, lfsr0, lfsr1, lfsr2, 0x96; +--:-:-:-:1 SHF.R.U64 rand1, rand0, 8, rand0; +--:-:-:-:1 SHF.R.U64 rand2, rand0, 16, rand0; +--:-:-:-:0 SHF.R.U64 rand3, rand0, 24, rand0; +//--:-:-:-:1 MOV32I rand0, 0x80000000; +//--:-:-:-:1 MOV32I rand1, 0x80000000; +//--:-:-:-:1 MOV32I rand2, 0x80000000; +//--:-:-:-:1 MOV32I rand3, 0x80000000; + + +// Convert rand to float +--:-:1:-:4 I2F.F32.U32.RZ rand0, rand0; +--:-:2:-:4 I2F.F32.U32.RZ rand1, rand1; +--:-:3:-:4 I2F.F32.U32.RZ rand2, rand2; 
+--:-:4:-:1 I2F.F32.U32.RZ rand3, rand3; + +// Scale the random number so msb is one below lsb of fp16 +// Add scaled random to number to round +01:-:-:-:1 FFMA.RZ c0, rand0, exp0, c0; +02:-:-:-:1 FFMA.RZ c1, rand1, exp1, c1; +04:-:-:-:1 FFMA.RZ c2, rand2, exp2, c2; +08:-:-:-:0 FFMA.RZ c3, rand3, exp3, c3; + +// Truncate number to fp16 +--:-:1:-:4 F2F.F16.F32.RZ c0, c0; +--:-:2:-:4 F2F.F16.F32.RZ c1, c1; +--:-:3:-:4 F2F.F16.F32.RZ c2, c2; +--:-:4:-:1 F2F.F16.F32.RZ c3, c3; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/hgemm_tn_128x32.sass b/Kernel/SGEMM/Maxwell/hgemm_tn_128x32.sass new file mode 100644 index 0000000..239d5d3 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_tn_128x32.sass @@ -0,0 +1,553 @@ +# Kernel: hgemm_tn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*16*2 + 32*16*2> + szShareA : 128*16 + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 4; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; 
+--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL lda16, lda, 5; +--:-:-:-:1 SHL ldb16, ldb, 5; +--:-:-:-:1 SHL lda4, lda, 2; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; # emits eight LDS.U.128 loads: czero00, czero04 .. czero28 + + +// tidAX = (tid & 31) << 2 +// tidAY = (tid >> 5) +01:-:-:-:1 LOP.AND tidAX, tid, 31; +--:-:-:-:1 SHL tidAX, tidAX, 2; +--:-:-:-:1 SHR.U32 tidAY, tid, 5; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += (blkA*128 + tidAX + lda*tidAY) * 2 (the LEA below shifts the index left by 1) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO2 ta0, lda, tidAY, txa; +08:-:-:-:1 XMAD.LO2 ta0, ldaz, blkZ, ta0; +--:-:-:-:1 IADD ta1, ta0, lda4; // ta1..ta3 each step by lda4 = lda << 2 +--:-:-:-:1 IADD ta2, ta1, lda4; +--:-:-:-:1 IADD ta3, ta2, lda4; + +--:-:-:-:1 LEA track0A0.CC, ta0, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta0, param_A[1], RZ, 1; +--:-:-:-:1 LEA track1A0.CC, ta1, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track1A1, ta1, param_A[1], RZ, 1; +--:-:-:-:1 LEA track2A0.CC, ta2, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track2A1, ta2, param_A[1], RZ, 1; +--:-:-:-:1 LEA track3A0.CC, ta3, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track3A1, ta3, param_A[1], RZ, 1; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 2 (the LEA below shifts the index left by 1) +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 1; + +// writeAs = (tidAY*128 + tidAX) * 4, plus the 4x offset operand. NOTE(review): that 4x has no <...> expression here, unlike 4x<64*8 + 128*8> in hgemm_tn_128x64; it looks truncated - confirm against upstream +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + tidBX) * 4, plus the 4x offset operand. NOTE(review): same bare 4x as for writeAs - confirm against upstream +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs,
readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; NOTE(review): the 4x<128*8> offset in this comment may be stale - this file maps szShareA to 128*16 and the ISCADD operand below is a bare 4x; confirm +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; // NOTE(review): bare 4x with no <...> expression (hgemm_tn_128x64 uses -4x<64*8 + 128*8>) - looks truncated, confirm against upstream + + +REMAINDER: + + + +--:-:-:-:1 IADD tidAY1, tidAY, 4; // tidAY1..3 = tidAY + 4, 8, 12; each is compared against k below +--:-:-:-:1 IADD tidAY2, tidAY, 8; +--:-:-:-:1 IADD tidAY3, tidAY, 12; + + + our $vec; # $vec true: return the LDG.E.CI.64 template; otherwise the per-element LDG.E.CI.U16 template + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.CI.64 load0A, [track0A]; +--:-:2:-:1 @P1 LDG.E.CI.64 load1A, [track1A]; +--:-:3:-:1 @P2 LDG.E.CI.64 load2A, [track2A]; +--:-:4:-:1 @P3 LDG.E.CI.64 load3A, [track3A]; +--:-:5:-:1 @P4 LDG.E.CI.64 loadB, [trackB]; + + + +--:-:6:-:1 @!P0 LDS.U.64 load0A, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.64 load1A, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.64 load2A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 load3A, [addr_zero]; +--:-:6:-:2 @!P4 LDS.U.64 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD txa1, txa, 1; +--:-:-:-:1 IADD txa2, txa, 2; +--:-:-:-:1 IADD txa3, txa, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P4; + +--:-:1:-:1 @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:1:-:1 @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:1:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:1:-:1 @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY1, k,
PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:2:-:1 @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P6; + +--:-:3:-:1 @P0 LDG.E.CI.U16 load2A0, [track2A + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.U16 load2A1, [track2A + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load2A2, [track2A + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load2A3, [track2A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:4:-:1 @P0 LDG.E.CI.U16 load3A0, [track3A + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.U16 load3A1, [track3A + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.U16 load3A2, [track3A + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load3A3, [track3A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND 
P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:5:-:1 @P0 LDG.E.CI.U16 loadB0, [trackB + 2x<0>]; +--:-:5:-:1 @P1 LDG.E.CI.U16 loadB1, [trackB + 2x<1>]; +--:-:5:-:1 @P2 LDG.E.CI.U16 loadB2, [trackB + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 loadB3, [trackB + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 15; + + + + + our $vec; + return $vec ? q{ +21:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:1:-:1 F2F.F32.F16 load0A0, load0A0.H0; + +02:-:-:-:1 F2F.F32.F16 load1A3, load1A1.H1; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A1.H0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A0.H1; +--:-:2:-:1 F2F.F32.F16 load1A0, load1A0.H0; + +04:-:-:-:1 F2F.F32.F16 load2A3, load2A1.H1; +--:-:-:-:1 F2F.F32.F16 load2A2, load2A1.H0; +--:-:-:-:1 F2F.F32.F16 load2A1, load2A0.H1; +--:-:3:-:1 F2F.F32.F16 load2A0, load2A0.H0; + +08:-:-:-:1 F2F.F32.F16 load3A3, load3A1.H1; +--:-:-:-:1 F2F.F32.F16 load3A2, load3A1.H0; +--:-:-:-:1 F2F.F32.F16 load3A1, load3A0.H1; +--:-:4:-:1 F2F.F32.F16 load3A0, load3A0.H0; + +10:-:-:-:1 F2F.F32.F16 loadB3, loadB1.H1; +--:-:-:-:1 F2F.F32.F16 loadB2, loadB1.H0; +--:-:-:-:1 F2F.F32.F16 loadB1, loadB0.H1; +--:-:5:-:1 F2F.F32.F16 loadB0, loadB0.H0; + } : q{ +21:-:-:-:1 F2F.F32.F16 load0A0, load0A0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:1:-:1 
F2F.F32.F16 load0A3, load0A3; + +02:-:-:-:1 F2F.F32.F16 load1A0, load1A0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A1; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A2; +--:-:2:-:1 F2F.F32.F16 load1A3, load1A3; + +04:-:-:-:1 F2F.F32.F16 load2A0, load2A0; +--:-:-:-:1 F2F.F32.F16 load2A1, load2A1; +--:-:-:-:1 F2F.F32.F16 load2A2, load2A2; +--:-:3:-:1 F2F.F32.F16 load2A3, load2A3; + +08:-:-:-:1 F2F.F32.F16 load3A0, load3A0; +--:-:-:-:1 F2F.F32.F16 load3A1, load3A1; +--:-:-:-:1 F2F.F32.F16 load3A2, load3A2; +--:-:4:-:1 F2F.F32.F16 load3A3, load3A3; + +10:-:-:-:1 F2F.F32.F16 loadB0, loadB0; +--:-:-:-:1 F2F.F32.F16 loadB1, loadB1; +--:-:-:-:1 F2F.F32.F16 loadB2, loadB2; +--:-:5:-:1 F2F.F32.F16 loadB3, loadB3; + }; + + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 16, P1; // P1 = (k > 16) && P1: second half of the bDoRemainder test (P1 was set from k & 15) + +01:-:-:-:1 STS.128 [writeAs + 4x<0*128>], load0A; +--:-:-:-:6 IADD track0A0.CC, track0A0, lda16; // advance track0A by lda16; low word sets carry +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; // high word takes the carry (same .CC/.X pairing for every track pointer below) + +02:-:-:-:1 STS.128 [writeAs + 4x<4*128>], load1A; +--:-:-:-:6 IADD track1A0.CC, track1A0, lda16; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS.128 [writeAs + 4x<8*128>], load2A; +--:-:-:-:6 IADD track2A0.CC, track2A0, lda16; +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS.128 [writeAs + 4x<12*128>], load3A; +--:-:-:-:6 IADD track3A0.CC, track3A0, lda16; +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; // the matching IADD.X for trackB1 comes after the buffer swap below + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; // buffer swap: read pointers move by -swapBuf, write pointers by +swapBuf +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; // negate swapBuf so the next swap moves all four pointers back + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; // high word of trackB: takes the carry from the trackB0.CC add above + + + our $vec; + return $vec ?
q{ +--:-:3:-:1 @P5 LDG.E.CI.64 load0A, [track0A]; +--:-:4:-:1 @P5 LDG.E.CI.64 load1A, [track1A]; +--:-:5:-:1 @P5 LDG.E.CI.64 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.CI.64 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.64 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P5 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:3:-:1 @P5 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:3:-:1 @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:3:-:1 @P5 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:4:-:1 @P5 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:4:-:1 @P5 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:4:-:1 @P5 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:4:-:1 @P5 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load2A0, [track2A + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2A1, [track2A + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2A2, [track2A + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2A3, [track2A + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>]; + }; + + + + our $vec; + our $shiftAX = 0; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n", + j9c6 => "10:5:-:-:1 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n", + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, lda16;\n", + j3c13 => 
"--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, lda16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2A0.CC, track2A0, lda16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, lda16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.64 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.64 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.64 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.64 loadB, [trackB];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0A3, load0A1.H1;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A2, load0A1.H0;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A1, load0A0.H1;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1A3, load1A1.H1;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A2, load1A1.H0;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A1, load1A0.H1;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1A0, load1A0.H0;\n", + + j6c13 => "10:-:-:-:1 \@P5 F2F.F32.F16 load2A3, load2A1.H1;\n", + j6c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2A2, load2A1.H0;\n", + j6c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2A1, load2A0.H1;\n", + j6c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load2A0, load2A0.H0;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A3, load3A1.H1;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A2, load3A1.H0;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A1, load3A0.H1;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3A0, load3A0.H0;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n", + ) : + ( + + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load1A0, [track1A + 
2x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0A0, load0A0;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A1, load0A1;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A2, load0A2;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0A3, load0A3;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1A0, load1A0;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A1, load1A1;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A2, load1A2;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1A3, load1A3;\n", + + j6c13 => "10:-:-:-:1 \@P5 F2F.F32.F16 load2A0, load2A0;\n", + j6c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2A1, load2A1;\n", + j6c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2A2, load2A2;\n", + j6c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load2A3, load2A3;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A0, load3A0;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A1, load3A1;\n", + j8c21 => 
"--:-:-:-:1 \@P5 F2F.F32.F16 load3A2, load3A2;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3A3, load3A3;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB3;\n", + j10c17 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB2, loadB2;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB0, loadB0;\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/hgemm_tn_128x64.sass b/Kernel/SGEMM/Maxwell/hgemm_tn_128x64.sass new file mode 100644 index 0000000..0404ab5 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/hgemm_tn_128x64.sass @@ -0,0 +1,389 @@ +# Kernel: hgemm_tn_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*2 + 64*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ tid, blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, ta, tb, tid1, tid15, tidX, x<1-3|65-67>, y<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-107 : loadA<0-7>, loadB<0-3> + 108-111 : trackA<0-1>, trackB<0-1> + + 112-122 ~ writeAs, writeBs, k, txa00, txa64, txb, tidY, swapBuf + 123-127 : readAs, readBs + + 64-83 ~ ldc, ldcz, ci, xmad_c, threadId, tid31, tid96, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C04y<0-1>, C08y<0-1>, C12y<0-1>, C00y<0-1> + 86-107 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +// tidX = (tid & 15) << 2 +// tidY = (tid >> 4) & 7 +01:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidX, 
tid15, 2; +--:-:-:-:1 BFE.U32 tidY, tid, 0x304; // 3 bits at position 4 + +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 4; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// trackA += (blkA*128 + lda*tidY + tidX) * 2 +02:-:-:-:1 ISCADD txa00, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa00; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x1; +--:-:-:-:1 IADD txa64, txa00, 64; + +// trackB += (blkB*64 + ldb*tidY + tidX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x1; + +// Start the write buffers high +// writeAs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; +// writeBs = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; + +// Start the read buffers low +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<64*8 + 128*8>; + + +REMAINDER: + + + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + + + our $vec; + return $vec ? 
q{ +// doLoad = tidY < k && txa00|txb < n|m +--:-:-:-:1 ISETP.LT.AND P4, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY, k, P6; + + +--:-:2:-:1 @P4 LDG.E.CI.64 loadA0, [trackA + 2x<00>]; +--:-:3:-:1 @P5 LDG.E.CI.64 loadA4, [trackA + 2x<64>]; +--:-:4:-:1 @P6 LDG.E.CI.64 loadB0, [trackB]; + +--:-:5:-:1 @!P4 LDS.U.64 loadA0, [addr_zero]; +--:-:5:-:1 @!P5 LDS.U.64 loadA4, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.64 loadB0, [addr_zero]; + + + } : q{ +// doLoadA = tidY < k && txa00 < m +// doLoadB = tidY < k && txb < n +--:-:-:-:1 IADD x1, txa00, 1; +--:-:-:-:1 IADD x2, txa00, 2; +--:-:-:-:1 IADD x3, txa00, 3; +--:-:-:-:1 IADD x65, txa64, 1; +--:-:-:-:1 IADD x66, txa64, 2; +--:-:-:-:1 IADD x67, txa64, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_m, P0; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<00 + 0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<00 + 1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<00 + 2>]; +--:-:2:-:1 @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x65, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x66, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x67, param_m, P0; + +--:-:3:-:1 @P0 LDG.E.CI.S16 loadA4, [trackA + 2x<00 + 64>]; +--:-:3:-:1 @P1 LDG.E.CI.S16 loadA5, [trackA + 2x<00 + 65>]; +--:-:3:-:1 @P2 LDG.E.CI.S16 loadA6, [trackA + 2x<00 + 66>]; +--:-:3:-:1 @P3 LDG.E.CI.S16 loadA7, [trackA + 2x<00 + 67>]; + +--:-:-:-:1 @!P0 MOV loadA4, RZ; +--:-:-:-:1 @!P1 MOV loadA5, RZ; +--:-:-:-:1 @!P2 MOV loadA6, RZ; +--:-:-:-:1 @!P3 MOV loadA7, RZ; + +--:-:-:-:1 IADD y1, txb, 1; +--:-:-:-:1 IADD y2, txb, 2; +--:-:-:-:1 IADD y3, txb, 3; 
+--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_n, P0; + +--:-:4:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>]; +--:-:4:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>]; +--:-:4:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>]; +--:-:4:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + }; + + + + + + our $vec; + return $vec ? q{ +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 7; + +12:-:-:-:4 F2F.F32.F16 loadA3, loadA1.H1; +--:-:-:-:4 F2F.F32.F16 loadA2, loadA1.H0; +--:-:-:-:4 F2F.F32.F16 loadA1, loadA0.H1; +--:-:2:-:4 F2F.F32.F16 loadA0, loadA0.H0; + +04:-:-:-:4 F2F.F32.F16 loadA7, loadA5.H1; +--:-:-:-:0 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:4 F2F.F32.F16 loadA6, loadA5.H0; +--:-:-:-:4 F2F.F32.F16 loadA5, loadA4.H1; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; +--:-:3:-:1 F2F.F32.F16 loadA4, loadA4.H0; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; + +02:-:-:-:1 STS.128 [writeAs + 4x<00>], loadA0; +04:-:-:-:1 STS.128 [writeAs + 4x<64>], loadA4; + +28:-:-:-:4 F2F.F32.F16 loadB3, loadB1.H1; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 F2F.F32.F16 loadB2, loadB1.H0; +--:-:-:-:4 F2F.F32.F16 loadB1, loadB0.H1; +--:-:2:-:2 F2F.F32.F16 loadB0, loadB0.H0; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +02:-:-:-:1 STS.128 [writeBs], loadB0; + + // scalar loads + } : q{ +// bDoRemainder = k > 8 +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +02:-:-:-:4 F2F.F32.F16 loadA0, loadA0; +--:-:-:-:4 F2F.F32.F16 loadA1, loadA1; +--:-:-:-:4 F2F.F32.F16 loadA2, loadA2; +--:-:2:-:4 F2F.F32.F16 loadA3, loadA3; + +04:-:-:-:4 F2F.F32.F16 loadA4, loadA4; +--:-:-:-:0 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:4 F2F.F32.F16 loadA5, loadA5; +--:-:-:-:4 F2F.F32.F16 
loadA6, loadA6; +--:-:3:-:1 F2F.F32.F16 loadA7, loadA7; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS.128 [writeAs + 4x<00>], loadA0; +04:-:-:-:1 STS.128 [writeAs + 4x<64>], loadA4; + +08:-:-:-:4 F2F.F32.F16 loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 F2F.F32.F16 loadB1, loadB1; +--:-:-:-:4 F2F.F32.F16 loadB2, loadB2; +--:-:2:-:2 F2F.F32.F16 loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +02:-:-:-:1 STS.128 [writeBs], loadB0; + + }; + + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P4, PT, k, $k_end, P4;\n"); + + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, $k_end, P5;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, $k_end, P6;\n", + j0c5 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c7 => "--:-:-:-:1 IADD32I k, k, -8;\n", + + ($vec ? 
+ ( + j0c8 => "--:-:2:-:1 \@P4 LDG.E.CI.64 loadA0, [trackA + 2x<00>];\n", + j0c11 => "--:-:3:-:1 \@P5 LDG.E.CI.64 loadA4, [trackA + 2x<64>];\n", + j0c14 => "--:-:4:-:1 \@P6 LDG.E.CI.64 loadB0, [trackB];\n", + + j4c3 => "02:-:-:-:1 \@P4 F2F.F32.F16 loadA3, loadA1.H1;\n", + j4c7 => "--:-:-:-:1 \@P4 F2F.F32.F16 loadA2, loadA1.H0;\n", + j4c11 => "--:-:-:-:1 \@P4 F2F.F32.F16 loadA1, loadA0.H1;\n", + j4c15 => "--:-:2:-:1 \@P4 F2F.F32.F16 loadA0, loadA0.H0;\n", + + j5c3 => "04:-:-:-:1 \@P5 F2F.F32.F16 loadA7, loadA5.H1;\n", + j5c7 => "--:-:-:-:1 \@P5 F2F.F32.F16 loadA6, loadA5.H0;\n", + j5c11 => "--:-:-:-:1 \@P5 F2F.F32.F16 loadA5, loadA4.H1;\n", + j5c15 => "--:-:3:-:1 \@P5 F2F.F32.F16 loadA4, loadA4.H0;\n", + + j6c3 => "08:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n", + j6c7 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n", + j6c11 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n", + j6c15 => "--:-:4:-:1 \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n", + ) : + ( + j0c10 => "--:-:2:-:1 \@P4 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n", + j0c12 => "--:-:2:-:1 \@P4 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n", + j0c14 => "--:-:2:-:1 \@P4 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n", + j0c16 => "--:-:2:-:1 \@P4 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n", + + j0c33 => "--:-:3:-:1 \@P5 LDG.E.CI.S16 loadA4, [trackA + 2x<64>];\n", + j0c35 => "--:-:3:-:1 \@P5 LDG.E.CI.S16 loadA5, [trackA + 2x<65>];\n", + j0c37 => "--:-:3:-:1 \@P5 LDG.E.CI.S16 loadA6, [trackA + 2x<66>];\n", + j0c39 => "--:-:3:-:1 \@P5 LDG.E.CI.S16 loadA7, [trackA + 2x<67>];\n", + + j1c10 => "--:-:4:-:1 \@P6 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j1c12 => "--:-:4:-:1 \@P6 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j1c14 => "--:-:4:-:1 \@P6 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j1c16 => "--:-:4:-:1 \@P6 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j4c3 => "02:-:-:-:1 \@P4 F2F.F32.F16 loadA0, loadA0;\n", + j4c7 => "--:-:-:-:1 \@P4 F2F.F32.F16 loadA1, loadA1;\n", + j4c11 => "--:-:-:-:1 \@P4 F2F.F32.F16 
loadA2, loadA2;\n", + j4c15 => "--:-:2:-:1 \@P4 F2F.F32.F16 loadA3, loadA3;\n", + + j5c3 => "04:-:-:-:1 \@P5 F2F.F32.F16 loadA4, loadA4;\n", + j5c7 => "--:-:-:-:1 \@P5 F2F.F32.F16 loadA5, loadA5;\n", + j5c11 => "--:-:-:-:1 \@P5 F2F.F32.F16 loadA6, loadA6;\n", + j5c15 => "--:-:3:-:1 \@P5 F2F.F32.F16 loadA7, loadA7;\n", + + j6c3 => "08:-:-:-:1 \@P6 F2F.F32.F16 loadB0, loadB0;\n", + j6c7 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB1;\n", + j6c11 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB2;\n", + j6c15 => "--:-:4:-:1 \@P6 F2F.F32.F16 loadB3, loadB3;\n", + ) + ), + + j4c31 => "02:-:-:-:1 \@P0 STS.128 [writeAs + 4x<00>], loadA0;\n", + j5c31 => "04:-:-:-:1 \@P0 STS.128 [writeAs + 4x<64>], loadA4;\n", + + j5c46 => "--:-:-:-:1 \@P0 IADD trackA0.CC, trackA0, param_lda8;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X trackA1, trackA1, RZ;\n", + + j6c31 => "08:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c46 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c54 => "--:-:-:-:1 \@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_common_128x128.sass b/Kernel/SGEMM/Maxwell/sgemm_common_128x128.sass new file mode 100644 index 0000000..703af8f --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_common_128x128.sass @@ -0,0 +1,309 @@ +# sgemm_common_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# FFMA main-loop generator for the 128x128 tile.
#
# Emits 8 unrolled steps (j = 0..7) of 64 FFMAs each.  After FFMA number c of
# step j, the text stored in $insert{"j<j>c<c>"} (if any) is appended, so the
# including kernel can schedule its own instructions between the FFMAs.
# Everything in @top is emitted once, ahead of the first FFMA.
# Returns the generated text.
our @top;
our %insert;

# Order in which the 8x8 accumulators cx<x>y<y> are visited: x advances in
# pairs (0,2,4,6); for every y base the four offsets in @quad are applied, and
# the y bases are walked in reverse for every other x pair.  64 entries total.
my @quad   = ([0,2],[1,2],[1,0],[0,0]);
my @y_base = (0,1,4,5);
my @order;
for my $x_base (0,2,4,6)
{
    for my $y0 (@y_base)
    {
        push @order, [$x_base + $_->[0], $y0 + $_->[1]] for @quad;
    }
    @y_base = reverse @y_base;
}

my $out = join '', @top;

for my $j (0 .. 7)
{
    my $cur  = $j & 1;          # register set (j0*/j1*) read by this step's FFMAs
    my $next = 1 - $cur;        # register set refilled by the loads below
    my $line = ($j + 1) % 8;    # shared-memory line fetched for the next step

    # The loads of the last step are predicated on P0.
    # NOTE(review): P0 is set by the including kernel, not in this file.
    my $pred = $j == 7 ? '@P0' : ' ';

    # Shared-memory fetches for the next step, placed after FFMAs 0, 2, 4 and 6.
    # These overwrite any entry the including kernel stored under the same key.
    $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $pred, $next, $line;
    $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $pred, $next, $line;
    $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $pred, $next, $line;
    $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $pred, $next, $line;

    for my $c (0 .. $#order)
    {
        my ($x, $y) = @{ $order[$c] };

        my $ins = $insert{"j${j}c$c"} || '';

        # Only the first inserted line decides the FFMA's stall count.  Most
        # slots have no insert at all; guard against that instead of running
        # the pattern match on undef.
        my ($first_line) = split /\n/, $ins;
        my $paired = defined $first_line
                  && $first_line =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/;

        my $stall = $paired ? 0 : 1;
        my $wait  = $c == 0 ? '01' : '--';              # first FFMA of a step waits on barrier 1
        my $yield = ($c == 32 && $stall) ? 'Y' : '-';   # yield hint mid-step, never with stall 0

        $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s",
            "$wait:-:-:$yield:$stall", $x,$y, $cur,$x, $cur,$y, $x,$y, $ins;
    }
}
return $out;
# C write-back driver for the 128x128 tile.
#
# For each accumulator row y = 0..7: scale cx0y<y>..cx7y<y> by alpha into
# c0..c7 and call STORE_C.  Just before row 4 the four C row pointers
# (C00y/C04y/C08y/C12y) are advanced by ldc60 and their row counters
# (cy00/cy04/cy08/cy12) by 60.  Returns the generated text.

# Emitted once, ahead of row 4.
my $advance_rows = join '',
    "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n",
    "--:-:-:-:1 IADD cy00, cy00, 60;\n",
    "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n",
    "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n",
    "--:-:-:-:1 IADD cy04, cy04, 60;\n",
    "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n",
    "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n",
    "--:-:-:-:1 IADD cy08, cy08, 60;\n",
    "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n",
    "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n",
    "--:-:-:-:1 IADD cy12, cy12, 60;\n",
    "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n";

# sprintf template for one row; every %d receives the row index.
my $scale_row = join '',
    "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n",
    "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n",
    "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n",
    "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n",
    "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n",
    "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n",
    "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n",
    "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n";

my @pieces;
for my $row (0 .. 7)
{
    if ($row == 4)
    {
        push @pieces, $advance_rows;
    }
    push @pieces, sprintf($scale_row, ($row) x 8);
    push @pieces, "--:-:-:-:5 CAL STORE_C;\n\n";
}
return join '', @pieces;
d3, beta, c3; + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +--:-:-:-:1 @P0 STG.E.CG [C00y0 + 4x<00>], c0; +--:5:-:-:1 @P1 STG.E.CG [C00y0 + 4x<64>], c1; +--:-:-:-:1 @P2 STG.E.CG [C04y0 + 4x<00>], c2; +--:6:-:-:1 @P3 STG.E.CG [C04y0 + 4x<64>], c3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C08y0 + 4x<00>]; +--:-:2:-:1 @P1 LDG.E d1, [C08y0 + 4x<64>]; +--:-:3:-:1 @P2 LDG.E d2, [C12y0 + 4x<00>]; +--:-:4:-:1 @P3 LDG.E d3, [C12y0 + 4x<64>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +--:-:-:-:2 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:2 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + + +10:-:-:-:4 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +20:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:0 IADD.X C04y1, C04y1, RZ; + +--:-:-:-:1 LDS c0, [readCs + 4x<2*128 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<2*128 + 64>]; +--:-:-:-:1 LDS c2, [readCs + 4x<3*128 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*128 + 64>]; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P6 FFMA c3, d3, beta, c3; + +01:-:-:-:1 @P0 STG.E.CG [C08y0 + 4x<00>], c0; +02:5:-:-:1 @P1 STG.E.CG [C08y0 + 4x<64>], c1; +04:-:-:-:1 @P2 STG.E.CG [C12y0 + 4x<00>], c2; +08:6:-:-:1 @P3 STG.E.CG [C12y0 + 4x<64>], c3; + +10:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +20:-:-:-:6 IADD C12y0.CC, 
# FFMA main-loop generator for the 128x32 tile.
#
# Emits 16 unrolled steps (j = 0..15) of 32 FFMAs each.  After FFMA number c
# of step j, the text stored in $insert{"j<j>c<c>"} (if any) is appended.
# Everything in @top is emitted once, ahead of the first FFMA.
# $shiftAX / $shiftBX are set by the including kernel; when true, the matching
# shared-memory address gets an extra (line >> 2)*8 term.
# Returns the generated text.
our @top;
our %insert;
our $shiftAX;
our $shiftBX;

# Order in which the accumulators cx<x>y<y> are visited: x advances in pairs
# (0,2); for every y base the four offsets in @quad are applied, and the y
# bases are walked in reverse for the second x pair.  32 entries total.
my @quad   = ([0,2],[1,2],[1,0],[0,0]);
my @y_base = (0,1,4,5);
my @order;
for my $x_base (0,2)
{
    for my $y0 (@y_base)
    {
        push @order, [$x_base + $_->[0], $y0 + $_->[1]] for @quad;
    }
    @y_base = reverse @y_base;
}

my $out = join '', @top;

for my $j (0 .. 15)
{
    my $barrier = ($j & 1) ? 2 : 1;     # odd steps use barrier 2, even steps barrier 1
    my $fill    = ($j + 2) & 3;         # register set refilled during this step (two steps ahead)
    my $line    = ($j + 2) & 15;        # shared-memory line fetched into that set
    my $shiftA  = $shiftAX ? $line >> 2 : 0;
    my $shiftB  = $shiftBX ? $line >> 2 : 0;
    my $compute = $j & 3;               # register set read by this step's FFMAs

    # The loads of the last two steps are predicated on P0.
    # NOTE(review): P0 is set by the including kernel, not in this file.
    my $pred = $j >= 14 ? '@P0' : ' ';

    # Shared-memory fetches, placed after FFMAs 0, 2 and 4.  These overwrite
    # any entry the including kernel stored under the same key.
    $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $pred, $fill, $line, $shiftA;
    $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $pred, $fill, $line, $shiftB;
    $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $pred, $fill, $line, $shiftA;

    for my $c (0 .. $#order)
    {
        my ($x, $y) = @{ $order[$c] };

        my $ins = $insert{"j${j}c$c"} || '';

        # Only the first inserted line decides the FFMA's stall count.  Most
        # slots have no insert at all; guard against that instead of running
        # the pattern match on undef.
        my ($first_line) = split /\n/, $ins;
        my $paired = defined $first_line
                  && $first_line =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/;

        my $stall = $paired ? 0 : 1;
        my $wait  = $c == 0 ? "0$barrier" : '--';       # first FFMA of a step waits on this step's barrier
        my $yield = ($c == 16 && $stall) ? 'Y' : '-';   # yield hint mid-step, never with stall 0

        $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s",
            "$wait:-:-:$yield:$stall", $x,$y, $compute,$x, $compute,$y, $x,$y, $ins;
    }
}
return $out;
# C write-back driver for the 128x32 tile.
#
# For each accumulator row y = 0..7: scale cx0y<y>..cx3y<y> by alpha into
# c0..c3 and call STORE_C.  Just before row 4 the four C row pointers
# (C00y/C04y/C08y/C12y) are advanced by ldc60 and their row counters
# (cy00/cy04/cy08/cy12) by 60.  Returns the generated text.

# Emitted once, ahead of row 4.
my $advance_rows = join '',
    "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n",
    "--:-:-:-:1 IADD cy00, cy00, 60;\n",
    "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n",
    "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n",
    "--:-:-:-:1 IADD cy04, cy04, 60;\n",
    "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n",
    "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n",
    "--:-:-:-:1 IADD cy08, cy08, 60;\n",
    "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n",
    "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n",
    "--:-:-:-:1 IADD cy12, cy12, 60;\n",
    "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n";

# sprintf template for one row; every %d receives the row index.
my $scale_row = join '',
    "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n",
    "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n",
    "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n",
    "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n";

my @pieces;
for my $row (0 .. 7)
{
    if ($row == 4)
    {
        push @pieces, $advance_rows;
    }
    push @pieces, sprintf($scale_row, ($row) x 4);
    push @pieces, "--:-:-:-:5 CAL STORE_C;\n\n";
}
return join '', @pieces;
# FFMA main-loop generator for the 128x64 tile.
#
# Emits 8 unrolled steps (j = 0..7) of 64 FFMAs each.  After FFMA number c of
# step j, the text stored in $insert{"j<j>c<c>"} (if any) is appended, so the
# including kernel can schedule its own instructions between the FFMAs.
# Everything in @top is emitted once, ahead of the first FFMA.
# Returns the generated text.
our @top;
our %insert;

# Order in which the 8x8 accumulators cx<x>y<y> are visited: x advances in
# pairs (0,2,4,6); for every y base the four offsets in @quad are applied, and
# the y bases are walked in reverse for every other x pair.  64 entries total.
my @quad   = ([0,2],[1,2],[1,0],[0,0]);
my @y_base = (0,1,4,5);
my @order;
for my $x_base (0,2,4,6)
{
    for my $y0 (@y_base)
    {
        push @order, [$x_base + $_->[0], $y0 + $_->[1]] for @quad;
    }
    @y_base = reverse @y_base;
}

my $out = join '', @top;

for my $j (0 .. 7)
{
    my $cur  = $j & 1;          # register set (j0*/j1*) read by this step's FFMAs
    my $next = 1 - $cur;        # register set refilled by the loads below
    my $line = ($j + 1) % 8;    # shared-memory line fetched for the next step

    # The loads of the last step are predicated on P0.
    # NOTE(review): P0 is set by the including kernel, not in this file.
    my $pred = $j == 7 ? '@P0' : ' ';

    # Shared-memory fetches for the next step, placed after FFMAs 0, 2, 4 and 6
    # (A lines are 128 wide, B lines 64 wide).  These overwrite any entry the
    # including kernel stored under the same key.
    $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $pred, $next, $line;
    $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $pred, $next, $line;
    $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $pred, $next, $line;
    $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $pred, $next, $line;

    for my $c (0 .. $#order)
    {
        my ($x, $y) = @{ $order[$c] };

        my $ins = $insert{"j${j}c$c"} || '';

        # Only the first inserted line decides the FFMA's stall count.  Most
        # slots have no insert at all; guard against that instead of running
        # the pattern match on undef.
        my ($first_line) = split /\n/, $ins;
        my $paired = defined $first_line
                  && $first_line =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/;

        my $stall = $paired ? 0 : 1;
        my $wait  = $c == 0 ? '01' : '--';              # first FFMA of a step waits on barrier 1
        my $yield = ($c == 32 && $stall) ? 'Y' : '-';   # yield hint mid-step, never with stall 0

        $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s",
            "$wait:-:-:$yield:$stall", $x,$y, $cur,$x, $cur,$y, $x,$y, $ins;
    }
}
return $out;
# C write-back driver for the 128x64 tile.
#
# For each accumulator row y = 0..7: scale cx0y<y>..cx7y<y> by alpha into
# c0..c7 and call STORE_C.  Just before row 4 the four C row pointers
# (C00y/C04y/C08y/C12y) are advanced by ldc60 and their row counters
# (cy00/cy04/cy08/cy12) by 60.  Returns the generated text.

# Emitted once, ahead of row 4.
my $advance_rows = join '',
    "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n",
    "--:-:-:-:1 IADD cy00, cy00, 60;\n",
    "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n",
    "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n",
    "--:-:-:-:1 IADD cy04, cy04, 60;\n",
    "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n",
    "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n",
    "--:-:-:-:1 IADD cy08, cy08, 60;\n",
    "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n",
    "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n",
    "--:-:-:-:1 IADD cy12, cy12, 60;\n",
    "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n";

# sprintf template for one row; every %d receives the row index.
my $scale_row = join '',
    "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n",
    "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n",
    "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n",
    "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n",
    "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n",
    "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n",
    "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n",
    "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n";

my @pieces;
for my $row (0 .. 7)
{
    if ($row == 4)
    {
        push @pieces, $advance_rows;
    }
    push @pieces, sprintf($scale_row, ($row) x 8);
    push @pieces, "--:-:-:-:5 CAL STORE_C;\n\n";
}
return join '', @pieces;
@P1 STG.E.CS [C00y0 + 4x<32>], c1; +--:-:-:-:1 @P2 STG.E.CS [C04y0 + 4x<00>], c2; +--:6:-:-:1 @P3 STG.E.CS [C04y0 + 4x<32>], c3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, P6; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C08y0 + 4x<00>]; +--:-:2:-:1 @P1 LDG.E d1, [C08y0 + 4x<32>]; +--:-:3:-:1 @P2 LDG.E d2, [C12y0 + 4x<00>]; +--:-:4:-:1 @P3 LDG.E d3, [C12y0 + 4x<32>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + + +10:-:-:-:2 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +20:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:0 IADD.X C04y1, C04y1, RZ; + +--:-:-:-:1 LDS c0, [readCs + 4x<2*64 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<3*64 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*64 + 32>]; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:1 @P6 FFMA c3, d3, beta, c3; + +--:-:-:-:1 @P0 STG.E.CS [C08y0 + 4x<00>], c0; +--:5:-:-:1 @P1 STG.E.CS [C08y0 + 4x<32>], c1; +--:-:-:-:1 @P2 STG.E.CS [C12y0 + 4x<00>], c2; +--:6:-:-:1 @P3 STG.E.CS [C12y0 + 4x<32>], c3; + +10:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +20:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; 
+--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/sgemm_common_32x128.sass b/Kernel/SGEMM/Maxwell/sgemm_common_32x128.sass new file mode 100644 index 0000000..da4d83d --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_common_32x128.sass @@ -0,0 +1,234 @@ +# Kernel: sgemm_common_32x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*32 + 16 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ?
$shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 16 && $stall ? 'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 128 + readBs; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 5; + +// readCs = tid * 4; +--:-:-:-:1 SHL readCs, tid, 2; + +// cx = blkB*128 + tid; +--:-:-:-:1 ISCADD cx, blkB, tid, 7; + +// cy = blkA*32 +--:-:-:-:1 SHL cy00, blkA, 5; + +// C += (cy*ldc + cx) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; + +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 2; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 2; + +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; + +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P4, RZ, flags, 
2; + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 ISCADD ldc12, ldc, -ldc4, 6; + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc12;\n" . + "--:-:-:-:1 IADD cy00, cy00, 12;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc12;\n" . + "--:-:-:-:1 IADD cy04, cy04, 12;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc12;\n" . + "--:-:-:-:1 IADD cy08, cy08, 12;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc12;\n" . + "--:-:-:-:1 IADD cy12, cy12, 12;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C00y]; +--:-:2:-:1 @P1 LDG.E d1, [C04y]; +--:-:3:-:1 @P2 LDG.E d2, [C08y]; +--:-:4:-:1 @P3 LDG.E d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:3 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*128>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*128>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*128>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*128>]; + + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P5 FFMA c3, d3, beta, c3; + +--:1:-:-:1 @P0 STG.E.CG [C00y], c0; +--:2:-:-:1 @P1 STG.E.CG [C04y], c1; +--:3:-:-:1 @P2 STG.E.CG [C08y], c2; +--:4:-:-:1 @P3 STG.E.CG [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git 
a/Kernel/SGEMM/Maxwell/sgemm_nn_128x128.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_128x128.sass new file mode 100644 index 0000000..22b8782 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_nn_128x128.sass @@ -0,0 +1,327 @@ +# Kernel: sgemm_nn_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, xmad_ta, xmad_tb, tid31, tid128 + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 ~ k<1-3>, x<1-3> + + 64-79 : j0Ay<0-7>, 
j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-107 : loadA<0-7>, loadB<0-3> + + 108-111 : trackA<0-1>, trackB<0-1> + + 112-121 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop + 122-127 ~ readAs, readBs, tid + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 MOV loop, RZ; +--:-:-:-:1 STS.128 [addr_zero], RZ; + + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); + + +// tidAY = (tid & 1) << 2 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; + +// tidAX = tid >> 1 +--:-:-:-:1 SHR.U32 tidAX, tid, 1; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) & 7 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL tidBX, tid31, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x305; // 3 bits at position 5 + +// trackB += (blkB*128 + ldb*tidBY + tidBX) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 7; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +//
writeAs = 4 * (128 * tidAY + tidAX) +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<128*8*2>, 2; + +// writeBs = (128*tidBY + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 7; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<128*8*3>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + + + our $vec; + return $vec ? q{ + +// k must be multiple of 8 +--:-:1:-:1 @P6 LDG.E.CI.128 loadB0, [trackB]; + +--:-:2:-:1 @P5 LDG.E.CI.128 loadA0, [trackA + 4x<0>]; +--:5:6:-:1 @P5 LDG.E.CI.128 loadA4, [trackA + 4x<8>]; + +--:-:3:-:1 @!P6 LDS.U.128 loadB0, [addr_zero]; +--:-:4:-:1 @!P5 LDS.U.128 loadA0, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.128 loadA4, [addr_zero]; + +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +05:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +0a:-:-:-:1 STS [writeAs + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeAs + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*128>], loadA3; + +10:-:-:-:6 IADD trackA0.CC, trackA0, 4x<16>; +--:-:-:-:1 IADD.X trackA1, trackA1, RZ; + + } : q{ + + + +// doLoad0 = tidBY < k +--:-:-:-:1 IADD x1, txb, 1; +--:-:-:-:1 IADD x2, txb, 2; +--:-:-:-:1 IADD x3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_n, P0; + +--:-:6:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; 
+--:-:6:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + +--:-:-:-:1 IADD k1, tidAY, 1; +--:-:-:-:1 IADD k2, tidAY, 2; +--:-:-:-:1 IADD k3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + + + +// bDoRemainder = k > 8 +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +20:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128>], loadA0; +04:-:-:-:1 STS [writeAs + 4x<1*128>], loadA1; +08:-:-:-:1 STS [writeAs + 4x<2*128>], loadA2; +10:-:-:-:1 STS [writeAs + 4x<3*128>], loadA3; + +--:-:-:-:6 IADD trackA0.CC, trackA0, 4x<8>; +--:-:-:-:1 IADD.X trackA1, trackA1, RZ; + }; + + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 LOP.XOR writeAs, writeAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR writeBs, writeBs, 4x<128*8*2>; + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n"); + our %insert = + ( + ($vec ? 
+ ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c15 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P5;\n", + + j0c10 => "--:-:2:-:1 \@P3 LDG.E.CI.128 loadB0, [trackB];\n", + + j0c28 => "--:-:3:-:1 \@P2 LDG.E.CI.128 loadA0, [trackA + 4x<0>];\n", + j0c30 => "20:5:6:-:1 \@P2 LDG.E.CI.128 loadA4, [trackA + 4x<8>];\n", + + j4c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128>], loadA4;\n", + j4c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128>], loadA5;\n", + j4c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128>], loadA6;\n", + j4c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128>], loadA7;\n", + + j5c35 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c29 => "04:-:-:-:1 \@P1 STS [writeAs + 4x<0*128>], loadA0;\n", + j6c31 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<1*128>], loadA1;\n", + j6c33 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<2*128>], loadA2;\n", + j6c35 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<3*128>], loadA3;\n", + + j6c46 => "10:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 4x<16>;\n", + j6c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + j0c10 => "--:-:6:-:1 \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j0c29 => "--:-:6:-:1 \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j0c31 => "--:-:6:-:1 \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j0c33 => "--:-:6:-:1 \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + + j0c35 => "--:-:2:-:1 \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j1c29 => "--:-:3:-:1 \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j1c31 => "--:-:4:-:1 \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j1c33 => "--:-:5:-:1 \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + + j5c39 => "20:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n", + j6c31 => 
"04:-:-:-:1 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n", + j6c33 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n", + j6c35 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n", + + j6c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n", + j6c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ) + ), + + j5c46 => "--:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j5c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:0 IADD32I k, k, -8;\n" . + "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_128x32.sass new file mode 100644 index 0000000..8194777 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_nn_128x32.sass @@ -0,0 +1,485 @@ +# Kernel: sgemm_nn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<(128*16 + 32)*2 + 32*16*2> + szShareA : (128*16 + 32) + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, shiftAX + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 
MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb16, ldb, 6; +--:-:-:-:1 SHL lda32, lda, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa00, blkA, tidAX, 7; +--:-:-:-:1 IADD txa32, txa00, 32; +--:-:-:-:1 IADD txa64, txa00, 64; +--:-:-:-:1 IADD txa96, txa00, 96; + +--:-:-:-:1 XMAD.LO ta00, lda, txa00, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta00, ldaz, blkZ, ta00; +--:-:-:-:1 IADD ta32, ta00, lda32; +--:-:-:-:1 IADD ta64, ta32, lda32; +--:-:-:-:1 IADD ta96, ta64, lda32; + +--:-:-:-:1 LEA track0A0.CC, ta00, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta00, param_A[1], RZ, 2; +--:-:-:-:1 LEA track1A0.CC, ta32, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track1A1, ta32, param_A[1], RZ, 2; +--:-:-:-:1 LEA track2A0.CC, ta64, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track2A1, ta64, param_A[1], RZ, 2; +--:-:-:-:1 LEA track3A0.CC, ta96, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track3A1, ta96, param_A[1], RZ, 2; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 2; + +// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX,
5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:2:-:1 @P3 LDG.E.CI.128 load1A, [track1A]; +--:-:3:-:1 @P4 LDG.E.CI.128 load2A, [track2A]; +--:-:4:-:1 @P5 LDG.E.CI.128 load3A, [track3A]; +--:-:5:-:1 @P6 LDG.E.CI.128 loadB, [trackB]; + + + +--:-:6:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load1A, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 load2A, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 load3A, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.128 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:1:-:1 @P0 LDG.E.CI load0A0, [track0A + 4x<0>]; +--:-:1:-:1 @P1 LDG.E.CI load0A1, [track0A + 4x<1>]; 
+--:-:1:-:1 @P2 LDG.E.CI load0A2, [track0A + 4x<2>]; +--:-:1:-:1 @P3 LDG.E.CI load0A3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI load1A0, [track1A + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI load1A1, [track1A + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI load1A2, [track1A + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI load1A3, [track1A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI load2A0, [track2A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI load2A1, [track2A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load2A2, [track2A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI load2A3, [track2A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI load3A0, [track3A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI load3A1, [track3A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI load3A2, [track3A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load3A3, [track3A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, 
RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:5:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:5:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:5:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P1, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + + + +21:-:-:-:1 STS [writeAs + 4x<0*128 + 0*32>], load0A0; +--:-:-:-:0 IADD track0A0.CC, track0A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 0*32>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 0*32>], load0A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 0*32>], load0A3; + +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128 + 1*32>], load1A0; +--:-:-:-:0 IADD track1A0.CC, track1A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 1*32>], load1A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 1*32>], load1A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 1*32>], load1A3; + +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 2*32>], load2A0; +--:-:-:-:0 IADD track2A0.CC, track2A0, 4x<16>; +--:-:-:-:1 STS 
[writeAs + 4x<1*128 + 2*32>], load2A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 2*32>], load2A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 2*32>], load2A3; + +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 3*32>], load3A0; +--:-:-:-:0 IADD track3A0.CC, track3A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 3*32>], load3A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 3*32>], load3A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 3*32>], load3A3; + +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:4:-:1 @P3 LDG.E.CI.128 load1A, [track1A]; +--:-:5:-:1 @P4 LDG.E.CI.128 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.CI.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.128 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI load0A0, [track0A + 4x<0>]; +--:-:3:-:1 @P2 LDG.E.CI load0A1, [track0A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load0A2, [track0A + 4x<2>]; +--:-:3:-:1 @P2 LDG.E.CI load0A3, [track0A + 4x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI load1A0, [track1A + 4x<0>]; +--:-:4:-:1 @P3 LDG.E.CI load1A1, [track1A + 4x<1>]; +--:-:4:-:1 @P3 LDG.E.CI load1A2, [track1A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load1A3, [track1A + 4x<3>]; + +--:-:5:-:1 @P4 LDG.E.CI load2A0, [track2A + 4x<0>]; +--:-:5:-:1 @P4 LDG.E.CI load2A1, [track2A + 4x<1>]; +--:-:5:-:1 @P4 LDG.E.CI load2A2, [track2A + 4x<2>]; +--:-:5:-:1 @P4 LDG.E.CI load2A3, [track2A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load3A0, [track3A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load3A1, [track3A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load3A2, [track3A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load3A3, 
[track3A + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:6:-:1 @P6 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P6 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P6 LDG.E.CI loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n", + j3c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n", + + j9c6 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n", + + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 4x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 4x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2A0.CC, track2A0, 4x<16>;\n", + 
j7c13 => "--:-:-:-:1 \@P4 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, 4x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.128 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI.128 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.128 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI load2A0, [track2A + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P4 LDG.E.CI load2A1, [track2A + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P4 
LDG.E.CI load2A2, [track2A + 4x<2>];\n", + j10c3 => "--:-:5:-:1 \@P4 LDG.E.CI load2A3, [track2A + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_128x64.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_128x64.sass new file mode 100644 index 0000000..2fca939 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_nn_128x64.sass @@ -0,0 +1,414 @@ +# Kernel: sgemm_nn_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*2 + 64*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ tid, blkA, blkB, blkZ, txb, tidAY, tidBY, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, ta, xmad_ta, tb, tid15, xmad_tb, k<1-3>, x<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-115 : loadAA<0-7>, loadA<0-7>, loadB<0-3> + + 116-121 : track0A<0-1>, track1A<0-1>, trackB<0-1> + + 122-125 ~ writeAs, writeBs, k, swapBuf + 126-127 ~ readAs, readBs + + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-125 ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +--:-:-:-:1 STS.128 [addr_zero], RZ; 
+ + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +// tidAY = (tid & 1) << 2 +// tidAX = tid >> 1 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; +01:-:-:-:1 SHR.U32 tidAX, tid, 1; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 2; +--:-:-:-:1 LEA track1A0.CC, lda, track0A0, 8; +--:-:-:-:1 LEA.HI.X track1A1, lda, track0A1, RZ, 8; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa, param_m, PT; +--:-:-:-:1 IADD txa, txa, 64; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// tidBX = (tid & 15) << 2 +// tidBY = (tid >> 4) & 7 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidBX, tid15, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4 + +// trackB += (blkB*64 + tidBX + ldb*tidBY) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:2 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// Start the write buffers high +// writeAs = (128*tidAY + tidAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; + +// writeBs = (64*tidBY + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; + +// Start the read buffers low +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs,
4x<128*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<64*8 + 128*8>; + + +REMAINDER: + + + our $vec; + return $vec ? q{ + +// k must be multiple of 8 +--:-:2:-:1 @P6 LDG.E.CI.128 loadB0, [trackB]; + +--:-:3:-:1 @P4 LDG.E.CI.128 loadA0, [track0A + 4x<0>]; +--:-:3:-:1 @P4 LDG.E.CI.128 loadAA0, [track0A + 4x<8>]; + +--:-:4:-:1 @P5 LDG.E.CI.128 loadA4, [track1A + 4x<0>]; +--:-:4:-:1 @P5 LDG.E.CI.128 loadAA4, [track1A + 4x<8>]; + +--:-:-:-:1 @!P6 LDS.U.128 loadB0, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 loadA0, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 loadA4, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 loadAA0, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.128 loadAA4, [addr_zero]; + +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +22:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 00>], loadA0; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 00>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 00>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*128 + 00>], loadA3; + +--:-:-:-:6 IADD track0A0.CC, track0A0, 4x<16>; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 64>], loadA4; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 64>], loadA5; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 64>], loadA6; +--:-:-:-:1 STS [writeAs + 4x<3*128 + 64>], loadA7; + +--:-:-:-:6 IADD track1A0.CC, track1A0, 4x<16>; +--:-:-:-:1 IADD.X track1A1, track1A1, RZ; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + + } : q{ + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; + +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidBX, tid15, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4 +02:-:-:-:1 ISCADD txb, 
blkB, tidBX, 6; + +// doLoad0 = tidBY < k +--:-:-:-:1 IADD x1, txb, 1; +--:-:-:-:1 IADD x2, txb, 2; +--:-:-:-:1 IADD x3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_n, P0; + +--:-:2:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 IADD k1, tidAY, 1; +--:-:-:-:1 IADD k2, tidAY, 2; +--:-:-:-:1 IADD k3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI loadA0, [track0A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI loadA1, [track0A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI loadA2, [track0A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI loadA3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI loadA4, [track1A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI loadA5, [track1A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI loadA6, [track1A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI loadA7, [track1A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA4, RZ; +--:-:-:-:1 @!P1 MOV loadA5, RZ; +--:-:-:-:1 @!P2 MOV loadA6, RZ; +--:-:-:-:1 @!P3 MOV loadA7, RZ; + + +02:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 00>], 
loadA0; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 00>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 00>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*128 + 00>], loadA3; + +--:-:-:-:6 IADD track0A0.CC, track0A0, 4x<8>; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 64>], loadA4; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 64>], loadA5; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 64>], loadA6; +--:-:-:-:1 STS [writeAs + 4x<3*128 + 64>], loadA7; + +--:-:-:-:6 IADD track1A0.CC, track1A0, 4x<8>; +--:-:-:-:1 IADD.X track1A1, track1A1, RZ; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + + }; + + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, P6;\n"); + our %insert = + ( + ($vec ? + ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + + j0c11 => "--:-:2:-:1 \@P0 LDG.E.CI.128 loadB0, [trackB];\n", + + j0c12 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c13 => "--:-:-:-:1 IADD32I k, k, -8;\n", + + j0c23 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P4;\n", + j0c24 => "--:-:-:-:1 PSETP.AND.AND P3, PT, P0, P1, P5;\n", + + j0c35 => "--:-:3:-:1 \@P2 LDG.E.CI.128 loadA0, [track0A + 4x<0>];\n", + j0c37 => "--:-:3:-:1 \@P2 LDG.E.CI.128 loadAA0, [track0A + 4x<8>];\n", + + j0c39 => "--:-:4:-:1 \@P3 LDG.E.CI.128 loadA4, [track1A + 4x<0>];\n", + j0c41 => "10:6:5:-:1 \@P3 LDG.E.CI.128 loadAA4, [track1A + 4x<8>];\n", + + j2c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128 + 00>], loadAA0;\n", + j2c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 00>], loadAA1;\n", + j2c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 00>], loadAA2;\n", + j2c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 00>], loadAA3;\n", + + j3c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 
4x<0*128 + 64>], loadAA4;\n", + j3c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 64>], loadAA5;\n", + j3c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 64>], loadAA6;\n", + j3c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 64>], loadAA7;\n", + + j5c29 => "04:-:-:-:1 \@P1 STS [writeAs + 4x<0*128 + 00>], loadA0;\n", + j5c31 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<1*128 + 00>], loadA1;\n", + j5c33 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<2*128 + 00>], loadA2;\n", + j5c35 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<3*128 + 00>], loadA3;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 4x<16>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + + j6c29 => "08:-:-:-:1 \@P1 STS [writeAs + 4x<0*128 + 64>], loadA4;\n", + j6c31 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<1*128 + 64>], loadA5;\n", + j6c33 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<2*128 + 64>], loadA6;\n", + j6c35 => "--:2:-:-:1 \@P1 STS [writeAs + 4x<3*128 + 64>], loadA7;\n", + + j6c46 => "20:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 4x<16>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P4;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P5;\n", + + j0c10 => "--:-:2:-:1 \@P0 LDG.E.CS loadB0, [trackB + 4x<0>];\n", + j0c12 => "--:-:2:-:1 \@P0 LDG.E.CS loadB1, [trackB + 4x<1>];\n", + j0c14 => "--:-:2:-:1 \@P0 LDG.E.CS loadB2, [trackB + 4x<2>];\n", + j0c16 => "--:-:2:-:1 \@P0 LDG.E.CS loadB3, [trackB + 4x<3>];\n", + + j0c18 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c20 => "--:-:-:-:1 IADD32I k, k, -8;\n", + + j0c33 => "--:-:3:-:1 \@P2 LDG.E.CI loadA0, [track0A + 4x<0>];\n", + j0c35 => "--:-:3:-:1 \@P2 LDG.E.CI loadA1, [track0A + 4x<1>];\n", + j0c37 => "--:-:3:-:1 \@P2 LDG.E.CI loadA2, [track0A + 4x<2>];\n", + j0c39 => "--:-:3:-:1 \@P2 LDG.E.CI loadA3, [track0A + 4x<3>];\n", + + j1c29 => "--:-:4:-:1 \@P3 LDG.E.CI loadA4, [track1A + 
4x<0>];\n", + j1c31 => "--:-:4:-:1 \@P3 LDG.E.CI loadA5, [track1A + 4x<1>];\n", + j1c33 => "--:-:4:-:1 \@P3 LDG.E.CI loadA6, [track1A + 4x<2>];\n", + j1c35 => "--:-:4:-:1 \@P3 LDG.E.CI loadA7, [track1A + 4x<3>];\n", + + j5c29 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 00>], loadA0;\n", + j5c31 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 00>], loadA1;\n", + j5c33 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 00>], loadA2;\n", + j5c35 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 00>], loadA3;\n", + + j5c46 => "--:-:-:-:1 \@P0 IADD track0A0.CC, track0A0, 4x<8>;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X track0A1, track0A1, RZ;\n", + + j6c29 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 64>], loadA4;\n", + j6c31 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 64>], loadA5;\n", + j6c33 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 64>], loadA6;\n", + j6c35 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 64>], loadA7;\n", + + j6c46 => "--:-:-:-:1 \@P0 IADD track1A0.CC, track1A0, 4x<8>;\n", + j6c54 => "--:-:-:-:1 \@P0 IADD.X track1A1, track1A1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ) + ), + + j4c21 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j4c22 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j4c27 => "--:-:-:-:1 \@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "02:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + + ); + return; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_32x128.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_32x128.sass new file mode 100644 index 0000000..e25c3a9 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_nn_32x128.sass @@ -0,0 +1,458 @@ +# Kernel: sgemm_nn_32x128 + +# Copyright 2014 Nervana Systems Inc. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<128*16*2 + (32*16 + 32)*2> + szShareA : (32*16 + 32) + szShareB : (128*16) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadA<0-3> + 84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + + 100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, 
tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; +--:-:3:-:1 S2R blkA, SR_CTAID.Y; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb4, ldb, 2; +--:-:-:-:1 SHL ldb16, ldb, 6; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) +01:-:-:-:1 LOP.AND tidBX, tid, 31; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 5; + +// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 4 +04:-:-:-:1 ISCADD txa, blkA, tidAX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 2; + +// trackB += (blkB*128 + tidBX + ldb*tidBY) * 4 +02:-:-:-:1 ISCADD txb, blkB, tidBX, 7; +--:-:-:-:1 XMAD.LO2 tb0, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb0, ldbz, blkZ, tb0; +--:-:-:-:1 IADD tb1, tb0, ldb4; +--:-:-:-:1 IADD tb2, tb1, ldb4; +--:-:-:-:1 IADD tb3, tb2, ldb4; + +--:-:-:-:1 LEA track0B0.CC, tb0, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track0B1, tb0, param_B[1], RZ, 2; +--:-:-:-:1 LEA track1B0.CC, tb1, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track1B1, tb1, param_B[1], RZ, 2; +--:-:-:-:1 LEA track2B0.CC, tb2, 
param_B[0], 2; +--:-:-:-:1 LEA.HI.X track2B1, tb2, param_B[1], RZ, 2; +--:-:-:-:1 LEA track3B0.CC, tb3, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track3B1, tb3, param_B[1], RZ, 2; + +// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*128 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 7; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4 +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readBs, readBs, tid96; +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + +--:-:-:-:1 IADD tidBY1, tidBY, 4; +--:-:-:-:1 IADD tidBY2, tidBY, 8; +--:-:-:-:1 IADD tidBY3, tidBY, 12; + + + our $vec; + return $vec ? 
q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txb, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidBY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.CI.128 load0B, [track0B]; +--:-:2:-:1 @P1 LDG.E.CI.128 load1B, [track1B]; +--:-:3:-:1 @P2 LDG.E.CI.128 load2B, [track2B]; +--:-:4:-:1 @P3 LDG.E.CI.128 load3B, [track3B]; +--:-:5:-:1 @P4 LDG.E.CI.128 loadA, [trackA]; + + + +--:-:6:-:1 @!P0 LDS.U.128 load0B, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 load1B, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.128 load2B, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load3B, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 loadA, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:1:-:1 @P0 LDG.E.CI load0B0, [track0B + 4x<0>]; +--:-:1:-:1 @P1 LDG.E.CI load0B1, [track0B + 4x<1>]; +--:-:1:-:1 @P2 LDG.E.CI load0B2, [track0B + 4x<2>]; +--:-:1:-:1 @P3 LDG.E.CI load0B3, [track0B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY1, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P5; + +--:-:2:-:1 @P0 LDG.E.CI load1B0, [track1B + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI 
load1B1, [track1B + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI load1B2, [track1B + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI load1B3, [track1B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:3:-:1 @P0 LDG.E.CI load2B0, [track2B + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI load2B1, [track2B + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load2B2, [track2B + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI load2B3, [track2B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:4:-:1 @P0 LDG.E.CI load3B0, [track3B + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI load3B1, [track3B + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI load3B2, [track3B + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load3B3, [track3B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P6; + +--:-:5:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:5:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:5:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + 
+--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P0, RZ, k, 15; +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 16, P0; + + + +21:-:-:-:1 STS.128 [writeBs + 4x<0*128>], load0B; +--:-:-:-:6 IADD track0B0.CC, track0B0, ldb16; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +02:-:-:-:1 STS.128 [writeBs + 4x<4*128>], load1B; +--:-:-:-:6 IADD track1B0.CC, track1B0, ldb16; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +04:-:-:-:1 STS.128 [writeBs + 4x<8*128>], load2B; +--:-:-:-:6 IADD track2B0.CC, track2B0, ldb16; +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +08:-:-:-:1 STS.128 [writeBs + 4x<12*128>], load3B; +--:-:-:-:6 IADD track3B0.CC, track3B0, ldb16; +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +10:-:-:-:1 STS [writeAs + 4x<0*32>], loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*32>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*32>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*32>], loadA3; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P2 LDG.E.CI.128 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.128 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load2B, [track2B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load3B, [track3B]; +--:-:6:-:1 @P6 LDG.E.CI.128 loadA, [trackA]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI load0B0, [track0B + 4x<0>]; +--:-:3:-:1 @P2 LDG.E.CI load0B1, [track0B + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load0B2, [track0B + 4x<2>]; +--:-:3:-:1 @P2 LDG.E.CI load0B3, [track0B + 4x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI load1B0, [track1B + 4x<0>]; +--:-:4:-:1 @P3 LDG.E.CI load1B1, [track1B + 4x<1>]; +--:-:4:-:1 @P3 LDG.E.CI load1B2, [track1B + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load1B3, [track1B + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load2B0, [track2B + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load2B1, [track2B + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load2B2, [track2B + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load2B3, [track2B + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load3B0, [track3B + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load3B1, [track3B + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load3B2, [track3B + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load3B3, [track3B + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:6:-:1 @P6 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:6:-:1 @P6 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:6:-:1 @P6 LDG.E.CI loadA3, [trackA + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n", + j9c6 => "--:5:-:-:1 \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n", + j11c6 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<0*32>], loadA0;\n", + j11c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], loadA1;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], loadA2;\n", + j11c12 => 
"--:6:-:-:1 \@P0 STS [writeAs + 4x<3*32>], loadA3;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0B0.CC, track0B0, ldb16;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0B1, track0B1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, ldb16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2B0.CC, track2B0, ldb16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2B1, track2B1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3B0.CC, track3B0, ldb16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3B1, track3B1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackA0.CC, trackA0, 4x<16>;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackA1, trackA1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.128 load0B, [track0B];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.128 load1B, [track1B];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.128 load2B, [track2B];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.128 load3B, [track3B];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.128 loadA, [trackA];\n", + ) : + ( + + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI load2B0, [track2B + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E.CI load2B1, [track2B + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E.CI load2B2, [track2B + 4x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E.CI load2B3, [track2B + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI load3B3, [track3B + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . 
+ "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_rnn_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_rnn_128x32.sass new file mode 100644 index 0000000..21b493d --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_nn_rnn_128x32.sass @@ -0,0 +1,512 @@ +# Kernel: sgemm_nn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(128*16 + 32)*2 + 32*16*2> + szShareA : (128*16 + 32) + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_bias[0] : c[0x0][0x158] + param_bias[1] : c[0x0][0x15c] + param_lockAddr[0] : c[0x0][0x160] + param_lockAddr[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_xcutoff : c[0x0][0x170] + param_flags : c[0x0][0x174] + param_lda : c[0x0][0x178] + param_ldb8 : c[0x0][0x17c] + param_ldc : c[0x0][0x180] + param_m : c[0x0][0x184] + param_n : c[0x0][0x188] + param_k : c[0x0][0x18c] + param_ldaz : c[0x0][0x190] + param_ldbz : c[0x0][0x194] + param_ldcz : c[0x0][0x198] + param_loops : c[0x0][0x19c] + param_dimB : c[0x0][0x1a0] + param_dimC : c[0x0][0x1a4] + param_unrolling : c[0x0][0x1a8] + param_numBlks : c[0x0][0x1ac] + param_numAblks : c[0x0][0x1b0] + + + + + 32-79 ~ lda, ldb, ldaz, 
lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, offsetB, shiftAX + 80-81 : baseB<0-1> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, time_step + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-61 : bias00y<0-1>, bias04y<0-1>, bias08y<0-1>, bias12y<0-1>, b0, b1, b2, b3, baseC<0-1> + 62-66 : blkId, nextBlk, lockAddr<0-1>, lockVal + 67-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags, xcutoff, offsetC, numBlk + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; + +--:-:-:-:1 MOV time_step, RZ; +--:-:-:-:1 MOV flags, param_flags; + +RNN_LOOP: + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb16, ldb, 6; +--:-:-:-:1 SHL lda32, lda, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +--:-:-:-:6 LOP.AND.NZ P0, RZ, flags, 4; +--:-:-:-:6 @P0 IADD offsetB, -time_step, param_unrolling; +--:-:-:-:6 @P0 IADD offsetB, offsetB, -1; +--:-:-:-:6 @!P0 MOV offsetB, time_step; + +// baseB = param_B + dimB * 
time_step +--:-:-:-:1 XMAD offsetB, offsetB, param_dimB, RZ; +--:-:-:-:1 LEA baseB0.CC, offsetB, param_B[0], 2; +--:-:-:-:1 LEA.HI.X baseB1, offsetB, param_B[1], RZ, 2; + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) -- the LEAs below shift the index left by 2; rows +32/+64/+96 add lda32 each +02:-:-:-:1 ISCADD txa00, blkA, tidAX, 7; +--:-:-:-:1 IADD txa32, txa00, 32; +--:-:-:-:1 IADD txa64, txa00, 64; +--:-:-:-:1 IADD txa96, txa00, 96; + +--:-:-:-:1 XMAD.LO ta00, lda, txa00, tidAY, xmad_ta; +--:-:-:-:1 XMAD.LO2 ta00, ldaz, RZ, ta00; +--:-:-:-:1 IADD ta32, ta00, lda32; +--:-:-:-:1 IADD ta64, ta32, lda32; +--:-:-:-:1 IADD ta96, ta64, lda32; + +--:-:-:-:1 LEA track0A0.CC, ta00, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta00, param_A[1], RZ, 2; +--:-:-:-:1 LEA track1A0.CC, ta32, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track1A1, ta32, param_A[1], RZ, 2; +--:-:-:-:1 LEA track2A0.CC, ta64, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track2A1, ta64, param_A[1], RZ, 2; +--:-:-:-:1 LEA track3A0.CC, ta96, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track3A1, ta96, param_A[1], RZ, 2; + +// trackB = baseB + 4 * (blkB*32 + ldb*tidBY + tidBX) -- the LEAs below shift the index left by 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +--:-:-:-:1 XMAD.LO2 tb, ldbz, RZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, baseB0, 2; +--:-:-:-:1 LEA.HI.X trackB1, tb, baseB1, RZ, 2; + +// writeAs = (tidAY*128 + tidAX + shiftAX) * 4, plus the 4x addend of the last ISCADD -- NOTE(review): that 4x operand has no expression in this copy; confirm the offset against the upstream kernel +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + tidBX) * 4, plus the 4x addend of the last ISCADD -- NOTE(review): that 4x operand has no expression in this copy; confirm the offset against the upstream kernel +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) <<
4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.128 load0A, [track0A]; +--:-:2:-:1 @P3 LDG.E.128 load1A, [track1A]; +--:-:3:-:1 @P4 LDG.E.128 load2A, [track2A]; +--:-:4:-:1 @P5 LDG.E.128 load3A, [track3A]; +--:-:5:-:1 @P6 LDG.E.128 loadB, [trackB]; + + + +--:-:6:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load1A, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 load2A, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 load3A, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.128 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:1:-:1 @P0 LDG.E load0A0, [track0A + 4x<0>]; +--:-:1:-:1 @P1 LDG.E load0A1, [track0A + 4x<1>]; +--:-:1:-:1 @P2 LDG.E load0A2, [track0A + 4x<2>]; +--:-:1:-:1 @P3 LDG.E load0A3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV 
load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:2:-:1 @P0 LDG.E load1A0, [track1A + 4x<0>]; +--:-:2:-:1 @P1 LDG.E load1A1, [track1A + 4x<1>]; +--:-:2:-:1 @P2 LDG.E load1A2, [track1A + 4x<2>]; +--:-:2:-:1 @P3 LDG.E load1A3, [track1A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:3:-:1 @P0 LDG.E load2A0, [track2A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E load2A1, [track2A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E load2A2, [track2A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E load2A3, [track2A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:4:-:1 @P0 LDG.E load3A0, [track3A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E load3A1, [track3A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E load3A2, [track3A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E load3A3, [track3A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P6, 
PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:5:-:1 @P0 LDG.E loadB0, [trackB + 4x<0>]; +--:-:5:-:1 @P1 LDG.E loadB1, [trackB + 4x<1>]; +--:-:5:-:1 @P2 LDG.E loadB2, [trackB + 4x<2>]; +--:-:5:-:1 @P3 LDG.E loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P1, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + + + +21:-:-:-:1 STS [writeAs + 4x<0*128 + 0*32>], load0A0; +--:-:-:-:0 IADD track0A0.CC, track0A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 0*32>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 0*32>], load0A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 0*32>], load0A3; + +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128 + 1*32>], load1A0; +--:-:-:-:0 IADD track1A0.CC, track1A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 1*32>], load1A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 1*32>], load1A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 1*32>], load1A3; + +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 2*32>], load2A0; +--:-:-:-:0 IADD track2A0.CC, track2A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 2*32>], load2A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 2*32>], load2A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 2*32>], load2A3; + +--:-:-:-:0 IADD.X track2A1, 
track2A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 3*32>], load3A0; +--:-:-:-:0 IADD track3A0.CC, track3A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 3*32>], load3A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 3*32>], load3A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 3*32>], load3A3; + +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.128 load0A, [track0A]; +--:-:4:-:1 @P3 LDG.E.128 load1A, [track1A]; +--:-:5:-:1 @P4 LDG.E.128 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.128 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P2 LDG.E load0A0, [track0A + 4x<0>]; +--:-:3:-:1 @P2 LDG.E load0A1, [track0A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E load0A2, [track0A + 4x<2>]; +--:-:3:-:1 @P2 LDG.E load0A3, [track0A + 4x<3>]; + +--:-:4:-:1 @P3 LDG.E load1A0, [track1A + 4x<0>]; +--:-:4:-:1 @P3 LDG.E load1A1, [track1A + 4x<1>]; +--:-:4:-:1 @P3 LDG.E load1A2, [track1A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E load1A3, [track1A + 4x<3>]; + +--:-:5:-:1 @P4 LDG.E load2A0, [track2A + 4x<0>]; +--:-:5:-:1 @P4 LDG.E load2A1, [track2A + 4x<1>]; +--:-:5:-:1 @P4 LDG.E load2A2, [track2A + 4x<2>]; +--:-:5:-:1 @P4 LDG.E load2A3, [track2A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E load3A0, [track3A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E load3A1, [track3A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E load3A2, [track3A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E load3A3, [track3A + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E loadB0, [trackB + 4x<0>]; +--:-:6:-:1 @P6 LDG.E loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P6 LDG.E loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P6 LDG.E loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our 
$shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n", + j3c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n", + + j9c6 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n", + + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 4x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 4x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2A0.CC, track2A0, 4x<16>;\n", + j7c13 => "--:-:-:-:1 \@P4 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, 4x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, 
ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.128 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.128 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P4 LDG.E.128 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.128 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E load0A0, [track0A + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E load0A1, [track0A + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E load0A2, [track0A + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E load0A3, [track0A + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E load1A0, [track1A + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E load1A1, [track1A + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E load1A2, [track1A + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E load1A3, [track1A + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P4 LDG.E load2A0, [track2A + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P4 LDG.E load2A1, [track2A + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P4 LDG.E load2A2, [track2A + 4x<2>];\n", + j10c3 => "--:-:5:-:1 \@P4 LDG.E load2A3, [track2A + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E load3A0, [track3A + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E load3A1, [track3A + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E load3A2, 
[track3A + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E load3A3, [track3A + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E loadB0, [trackB + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E loadB1, [trackB + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E loadB2, [trackB + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_nt_128x128.sass b/Kernel/SGEMM/Maxwell/sgemm_nt_128x128.sass new file mode 100644 index 0000000..e01b4b5 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_nt_128x128.sass @@ -0,0 +1,339 @@ +# Kernel: sgemm_nt_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ blkA, blkB, blkZ, tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, tid127, txa, txb, xmad_ta, xmad_tb, tid128 + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ k1, k2, k3 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-111 : loadA<0-7>, loadB<0-7> + 112-115 : trackA<0-1>, trackB<0-1> + + 116-121 ~ writeS, k, tidY, ta, tb, loop + 122-127 ~ readAs, readBs, tid + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 LOP.AND tid1, tid, 1; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); + +--:-:-:-:1 MOV loop, RZ; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV 
ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// tidY = tid1 << 2 +--:-:-:-:1 SHL tidY, tid1, 2; + +// tidX = tid >> 1 +01:-:-:-:1 SHR.U32 tidX, tid, 1; + +// trackA += 4 * ((blkA*128 + tidX) * lda + tidY) +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x2; + +// trackB += 4 * ((blkB*128 + tidX) * ldb + tidY) +04:-:-:-:1 ISCADD txb, blkB, tidX, 7; +--:-:-:-:1 XMAD.LO tb, ldb, txb, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = (4 * (128 * tidY + tidX)) ^ 4x<128*8*2> -- the XOR with 8192 bytes is applied by the LOP.XOR after the SHL +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 SHL writeS, writeS, 2; + +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<128*8*2>; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((((tid & 128) >> 4) | ((tid >> 1) & 7)) << 4) + 4096 -- the shift is applied before the add; 4096 = 4x<128*8> +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + + + our $vec; + return $vec ?
q{ + +// k must be multiple of 8 +--:-:2:-:1 @P5 LDG.E.CI.128 loadA0, [trackA + 4x<0>]; +--:-:2:-:1 @P5 LDG.E.CI.128 loadA4, [trackA + 4x<8>]; + +--:-:3:-:1 @P6 LDG.E.CI.128 loadB0, [trackB + 4x<0>]; +--:5:4:-:1 @P6 LDG.E.CI.128 loadB4, [trackB + 4x<8>]; + +--:-:-:-:1 @!P5 LDS.U.128 loadA0, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.128 loadB0, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.128 loadA4, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.128 loadB4, [addr_zero]; + +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +22:-:-:-:1 STS [writeS + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeS + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeS + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeS + 4x<3*128>], loadA3; + +--:-:-:-:6 IADD trackA0.CC, trackA0, 4x<16>; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +24:-:-:-:1 STS [writeS + 4x< 8*128>], loadB0; +--:-:-:-:1 STS [writeS + 4x< 9*128>], loadB1; +--:-:-:-:1 STS [writeS + 4x<10*128>], loadB2; +--:-:-:-:1 STS [writeS + 4x<11*128>], loadB3; + +10:-:-:-:6 IADD trackB0.CC, trackB0, 4x<16>; +--:-:-:-:1 IADD.X trackB1, trackB1, RZ; + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<128*8*2>; + + } : q{ + + +--:-:-:-:1 IADD k1, tidY, 1; +--:-:-:-:1 IADD k2, tidY, 2; +--:-:-:-:1 IADD k3, tidY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P6; +--:-:-:-:1 
ISETP.LT.AND P3, PT, k3, k, P6; + +--:-:3:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 7; + +02:-:-:-:1 STS [writeS + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeS + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeS + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeS + 4x<3*128>], loadA3; + +--:-:-:-:6 IADD trackA0.CC, trackA0, 4x<8>; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +04:-:-:-:1 STS [writeS + 4x< 8*128>], loadB0; +--:-:-:-:1 STS [writeS + 4x< 9*128>], loadB1; +--:-:-:-:1 STS [writeS + 4x<10*128>], loadB2; +--:-:-:-:1 STS [writeS + 4x<11*128>], loadB3; + +--:-:-:-:6 IADD trackB0.CC, trackB0, 4x<8>; +--:-:-:-:1 IADD.X trackB1, trackB1, RZ; + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<128*8*2>; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; + }; + + + + our $vec; + our $vec; + our @top = $vec ? + ("--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n") : + ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, 16, P5;\n"); + our %insert = + ( + ($vec ? 
+ ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + j0c13 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P5;\n", + j0c14 => "--:-:-:-:1 PSETP.AND.AND P3, PT, P0, P1, P6;\n", + + j0c27 => "--:-:2:-:1 \@P2 LDG.E.CI.128 loadA0, [trackA + 4x<0>];\n", + j0c29 => "--:-:2:-:1 \@P2 LDG.E.CI.128 loadA4, [trackA + 4x<8>];\n", + + j0c31 => "--:-:3:-:1 \@P3 LDG.E.CI.128 loadB0, [trackB + 4x<0>];\n", + j0c33 => "08:5:4:-:1 \@P3 LDG.E.CI.128 loadB4, [trackB + 4x<8>];\n", + + j3c29 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<0*128>], loadA4;\n", + j3c31 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<1*128>], loadA5;\n", + j3c33 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<2*128>], loadA6;\n", + j3c35 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<3*128>], loadA7;\n", + + j4c29 => "--:-:-:-:1 \@!P1 STS [writeS + 4x< 8*128>], loadB4;\n", + j4c31 => "--:-:-:-:1 \@!P1 STS [writeS + 4x< 9*128>], loadB5;\n", + j4c33 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<10*128>], loadB6;\n", + j4c35 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<11*128>], loadB7;\n", + + j5c29 => "02:-:-:-:1 \@P1 STS [writeS + 4x<0*128>], loadA0;\n", + j5c31 => "--:-:-:-:1 \@P1 STS [writeS + 4x<1*128>], loadA1;\n", + j5c33 => "--:-:-:-:1 \@P1 STS [writeS + 4x<2*128>], loadA2;\n", + j5c35 => "--:-:-:-:1 \@P1 STS [writeS + 4x<3*128>], loadA3;\n", + + j6c29 => "04:-:-:-:1 \@P1 STS [writeS + 4x< 8*128>], loadB0;\n", + j6c31 => "--:-:-:-:1 \@P1 STS [writeS + 4x< 9*128>], loadB1;\n", + j6c33 => "--:-:-:-:1 \@P1 STS [writeS + 4x<10*128>], loadB2;\n", + j6c35 => "--:2:-:-:1 \@P1 STS [writeS + 4x<11*128>], loadB3;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 4x<16>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c46 => "10:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, 4x<16>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 16, P6;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j0c10 => 
"--:-:2:-:1 \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j0c29 => "--:-:2:-:1 \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j0c31 => "--:-:2:-:1 \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j0c33 => "--:-:2:-:1 \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + + j0c35 => "--:-:3:-:1 \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j1c27 => "--:-:3:-:1 \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j1c29 => "--:-:3:-:1 \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j1c31 => "--:-:3:-:1 \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + + j5c29 => "02:-:-:-:1 \@P0 STS [writeS + 4x<0*128>], loadA0;\n", + j5c31 => "--:-:-:-:1 \@P0 STS [writeS + 4x<1*128>], loadA1;\n", + j5c33 => "--:-:-:-:1 \@P0 STS [writeS + 4x<2*128>], loadA2;\n", + j5c35 => "--:-:-:-:1 \@P0 STS [writeS + 4x<3*128>], loadA3;\n", + + j6c29 => "04:-:-:-:1 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n", + j6c31 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n", + j6c33 => "--:-:-:-:1 \@P0 STS [writeS + 4x<10*128>], loadB2;\n", + j6c35 => "--:2:-:-:1 \@P0 STS [writeS + 4x<11*128>], loadB3;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c46 => "--:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, 4x<8>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ), + ), + + j6c63 => "02:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" . + "--:-:-:-:1 IADD32I k, k, -8;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_nt_32x128.sass b/Kernel/SGEMM/Maxwell/sgemm_nt_32x128.sass new file mode 100644 index 0000000..339c825 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_nt_32x128.sass @@ -0,0 +1,483 @@ +# Kernel: sgemm_nt_32x128 + +# Copyright 2014 Nervana Systems Inc. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(128*16 + 32)*2 + (32*16 + 32)*2> + szShareA : (32*16 + 32) + szShareB : (128*16 + 32) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ tidX, lda, ldb, ldaz, ldbz, ldb32, tid1, tid3, tid96, ta, tb00, tb32, tb64, tb96, xmad_ta, xmad_tb, shiftX, tidY<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadA<0-3> + 84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + + 100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 110-120 ~ writeAs, writeBs, k, tidY, txa, txb00, 
txb32, txb64, txb96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; +--:-:3:-:1 S2R blkA, SR_CTAID.Y; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb32, ldb, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidX = tid >> 2 +// tidY = (tid & 3) << 2 +// shiftX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidY, tid3, 2; +--:-:-:-:1 SHL shiftX, tid3, 3; + +// trackA += ((blkA*32 + tidX) * lda + tidAY) * 4 +04:-:-:-:1 ISCADD txa, blkA, tidX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 2; + +// trackB += ((blkB*128 + tidX) * ldb + tidY) * 4 +02:-:-:-:1 ISCADD txb00, blkB, tidX, 7; +--:-:-:-:1 IADD txb32, txb00, 32; +--:-:-:-:1 IADD txb64, txb00, 64; +--:-:-:-:1 IADD txb96, txb00, 96; + +--:-:-:-:1 XMAD.LO tb00, ldb, txb00, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb00, ldbz, blkZ, tb00; +--:-:-:-:1 IADD tb32, tb00, ldb32; +--:-:-:-:1 IADD tb64, tb32, ldb32; +--:-:-:-:1 IADD tb96, tb64, ldb32; + +--:-:-:-:1 LEA track0B0.CC, tb00, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track0B1, tb00, param_B[1], RZ, 2; +--:-:-:-:1 LEA track1B0.CC, tb32, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track1B1, tb32, param_B[1], RZ, 2; +--:-:-:-:1 LEA track2B0.CC, tb64, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track2B1, tb64, param_B[1], RZ, 2; +--:-:-:-:1 LEA track3B0.CC, 
tb96, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track3B1, tb96, param_B[1], RZ, 2; + +// writeAs = (tidY*32 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 7; +--:-:-:-:1 IADD writeBs, writeBs, shiftX; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4 +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readBs, readBs, tid96; +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? 
q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb32, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb64, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb96, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.CI.128 load0B, [track0B]; +--:-:2:-:1 @P3 LDG.E.CI.128 load1B, [track1B]; +--:-:3:-:1 @P4 LDG.E.CI.128 load2B, [track2B]; +--:-:4:-:1 @P5 LDG.E.CI.128 load3B, [track3B]; +--:-:5:-:1 @P6 LDG.E.CI.128 loadA, [trackA]; + + + +--:-:6:-:1 @!P2 LDS.U.128 load0B, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load1B, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 load2B, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 load3B, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.128 loadA, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidY1, tidY, 1; +--:-:-:-:1 IADD tidY2, tidY, 2; +--:-:-:-:1 IADD tidY3, tidY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P4; + +--:-:1:-:1 @P0 LDG.E.CI load0B0, [track0B + 4x<0>]; +--:-:1:-:1 @P1 LDG.E.CI load0B1, [track0B + 4x<1>]; +--:-:1:-:1 @P2 LDG.E.CI load0B2, [track0B + 4x<2>]; +--:-:1:-:1 @P3 LDG.E.CI load0B3, [track0B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb32, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI load1B0, [track1B + 4x<0>]; 
+--:-:2:-:1 @P1 LDG.E.CI load1B1, [track1B + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI load1B2, [track1B + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI load1B3, [track1B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb64, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI load2B0, [track2B + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI load2B1, [track2B + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load2B2, [track2B + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI load2B3, [track2B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb96, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI load3B0, [track3B + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI load3B1, [track3B + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI load3B2, [track3B + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load3B3, [track3B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P6; + +--:-:5:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:5:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:5:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; 
+--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb32, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P0, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P0; + + + +21:-:-:-:1 STS [writeBs + 4x<0*128 + 0*32>], load0B0; +--:-:-:-:0 IADD track0B0.CC, track0B0, 4x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 0*32>], load0B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 0*32>], load0B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 0*32>], load0B3; + +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +02:-:-:-:1 STS [writeBs + 4x<0*128 + 1*32>], load1B0; +--:-:-:-:0 IADD track1B0.CC, track1B0, 4x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 1*32>], load1B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 1*32>], load1B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 1*32>], load1B3; + +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +04:-:-:-:1 STS [writeBs + 4x<0*128 + 2*32>], load2B0; +--:-:-:-:0 IADD track2B0.CC, track2B0, 4x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 2*32>], load2B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 2*32>], load2B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 2*32>], load2B3; + +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +08:-:-:-:1 STS [writeBs + 4x<0*128 + 3*32>], load3B0; +--:-:-:-:0 IADD track3B0.CC, track3B0, 4x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 3*32>], load3B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 3*32>], load3B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 3*32>], load3B3; + +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +10:-:-:-:1 STS [writeAs + 4x<0*32>], loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*32>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*32>], loadA2; +--:-:-:-:1 STS [writeAs + 
4x<3*32>], loadA3; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.CI.128 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.128 load1B, [track1B]; +--:-:5:-:1 @P4 LDG.E.CI.128 load2B, [track2B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load3B, [track3B]; +--:-:6:-:1 @P6 LDG.E.CI.128 loadA, [trackA]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI load0B0, [track0B + 4x<0>]; +--:-:3:-:1 @P2 LDG.E.CI load0B1, [track0B + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load0B2, [track0B + 4x<2>]; +--:-:3:-:1 @P2 LDG.E.CI load0B3, [track0B + 4x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI load1B0, [track1B + 4x<0>]; +--:-:4:-:1 @P3 LDG.E.CI load1B1, [track1B + 4x<1>]; +--:-:4:-:1 @P3 LDG.E.CI load1B2, [track1B + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load1B3, [track1B + 4x<3>]; + +--:-:5:-:1 @P4 LDG.E.CI load2B0, [track2B + 4x<0>]; +--:-:5:-:1 @P4 LDG.E.CI load2B1, [track2B + 4x<1>]; +--:-:5:-:1 @P4 LDG.E.CI load2B2, [track2B + 4x<2>]; +--:-:5:-:1 @P4 LDG.E.CI load2B3, [track2B + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load3B0, [track3B + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load3B1, [track3B + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load3B2, [track3B + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load3B3, [track3B + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:6:-:1 @P6 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:6:-:1 @P6 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:6:-:1 @P6 LDG.E.CI loadA3, [trackA + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 1; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 0*32>], load0B0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 0*32>], load0B1;\n", + j3c10 => 
"--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 0*32>], load0B2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 0*32>], load0B3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 1*32>], load1B0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 1*32>], load1B1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 1*32>], load1B2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 1*32>], load1B3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 2*32>], load2B0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 2*32>], load2B1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 2*32>], load2B2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 2*32>], load2B3;\n", + + j9c6 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 3*32>], load3B0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 3*32>], load3B1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 3*32>], load3B2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 3*32>], load3B3;\n", + + j11c6 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<0*32>], loadA0;\n", + j11c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], loadA1;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], loadA2;\n", + j11c12 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<3*32>], loadA3;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0B0.CC, track0B0, 4x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0B1, track0B1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, 4x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2B0.CC, track2B0, 4x<16>;\n", + j7c13 => "--:-:-:-:1 \@P4 IADD.X track2B1, track2B1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3B0.CC, track3B0, 4x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3B1, track3B1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackA0.CC, trackA0, 4x<16>;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackA1, trackA1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, 
PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.128 load0B, [track0B];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.128 load1B, [track1B];\n", + j9c29 => "10:-:5:-:1 \@P4 LDG.E.CI.128 load2B, [track2B];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.128 load3B, [track3B];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.128 loadA, [trackA];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI load2B0, [track2B + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P4 LDG.E.CI load2B1, [track2B + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P4 LDG.E.CI load2B2, [track2B + 4x<2>];\n", + j10c3 => "--:-:-:-:1 \@P4 LDG.E.CI load2B3, [track2B + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI 
load3B3, [track3B + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_rnn_bprop_common_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_rnn_bprop_common_128x32.sass new file mode 100644 index 0000000..9f5919a --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_rnn_bprop_common_128x32.sass @@ -0,0 +1,362 @@ +# sgemm_common_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00 + 0*8>];
+--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*32 + 00 + 0*8>];
+--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+
+
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+    # Generate the unrolled main loop: 16 j-steps of 32 FFMAs each, with %insert entries spliced in.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # the second x base visits the y bases in reverse order
+    }
+    # @cOrder now holds 32 [x,y] pairs; whatever is in @top is emitted ahead of the loop body.
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        my $barrier = $j & 1 ? 2 : 1;
+        my $rsPred = $j >= 14 ? '@P0' : ' ';
+        my $loadReg = ($j + 2) & 3;
+        my $shareLine = ($j + 2) & 15;
+        my $shiftA = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute = $j & 3;
+
+        # The shared loads for line (j+2)&15 unconditionally take slots c0/c2/c4 of step j.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins = $insert{"j${j}c$c"} || '';
+
+            my $wait = $c == 0 ? "0$barrier" : '--';
+            # Stall 0 when the first inserted line names one of these ops; the anchored match never sees undef for an empty $ins.
+            my $stall = $ins =~ /\A[^\n]*(?:LDS|F2F|I2I|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $yield = $c == 16 && $stall ? 'Y' : '-';
+
+            my $ctrl = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins;
+        }
+    }
+    return $out;
+
+
+
+
+
+--:-:-:-:1 MOV alpha, param_alpha;
+--:-:-:-:1 MOV beta, param_beta;
+--:-:-:-:1 MOV flags, param_flags;
+--:-:-:-:5 MOV xcutoff, param_xcutoff;
+
+--:-:-:-:6 LOP.AND.NZ P0, RZ, flags, 4;
+--:-:-:-:6 @P0 IADD offsetC, -time_step, param_unrolling;
+--:-:-:-:6 @P0 IADD offsetC, offsetC, -1;
+--:-:-:-:6 @!P0 MOV offsetC, time_step;
+
+// baseH = param_H + dimH * time_step
+--:-:-:-:1 XMAD offsetH, offsetC, param_dimH, RZ;
+--:-:-:-:1 LEA baseH0.CC, offsetH, param_H[0], 2;
+--:-:-:-:1 LEA.HI.X baseH1, offsetH, param_H[1], RZ, 2;
+
+// baseC = param_C + dimC * time_step
+--:-:-:-:1 XMAD offsetC, offsetC, param_dimC, RZ;
+--:-:-:-:1 LEA baseC0.CC, offsetC, param_C[0], 2;
+--:-:-:-:1 LEA.HI.X baseC1, offsetC, param_C[1], RZ, 2;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1 IADD readBs, readBs, -4x;
+--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf;
+--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf;
+--:-:-:-:1 ISCADD writeCs, readAs, readBs, 3;
+
+// readCs = ((tid & 96) << 2) | (tid & 31) << 2;
+--:-:-:-:1 LOP.AND tid31, tid, 31;
+--:-:-:-:1 LOP.AND tid96, tid, 96;
+--:-:-:-:1 ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1 SHL readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1 ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1 SHR.U32 cy00, tid96, 1;
+--:-:-:-:1 ISCADD cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
+--:-:-:-:1 MOV ldc, param_ldc;
+--:-:-:-:1 MOV ldcz, param_ldcz;
+--:-:-:-:1 XMAD.LO ci, ldc, cy00, cx, xmad_c;
+--:-:-:-:1 XMAD.LO2 ci, ldcz, RZ, ci;
+--:-:-:-:1 LEA C00y0.CC, ci, baseC0, 2;
+--:-:-:-:1 LEA.HI.X C00y1, ci, baseC1, RZ, 2;
+
+// Apply relu
+--:-:-:-:0 LOP.AND.NZ P4, RZ, flags, 2;
+// cx < n
+--:-:-:-:1
ISETP.LT.AND P6, PT, cx, param_n, PT; +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; + +--:-:-:-:1 MOV ldh1, param_ldh; + +// H += (ldh*cy + cx) * 4 +--:-:-:-:1 XMAD.LO ci, ldh1, cy00, cx, xmad_c; +--:-:-:-:1 LEA H00y0.CC, ci, baseH0, 2; +--:-:-:-:1 LEA.HI.X H00y1, ci, baseH1, RZ, 2; + +--:-:-:-:1 SHL ldh1, ldh1, 2; +--:-:-:-:1 SHL ldh4, ldh1, 2; +--:-:-:-:1 SHL ldh60, ldh1, 6; +--:-:-:-:1 IADD ldh60, ldh60, -ldh4; + + +--:-:-:-:4 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 MOV d0, RZ; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:4 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 MOV d1, RZ; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:3 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 MOV d2, RZ; +--:-:-:-:1 MOV d3, RZ; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:1 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:6 IADD H04y0.CC, H00y0, ldh4; +--:-:-:-:1 IADD.X H04y1, H00y1, RZ; +--:-:-:-:6 IADD H08y0.CC, H04y0, ldh4; +--:-:-:-:1 IADD.X H08y1, H04y1, RZ; +--:-:-:-:6 IADD H12y0.CC, H08y0, ldh4; +--:-:-:-:0 IADD.X H12y1, H08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" . + "--:-:-:-:6 IADD H00y0.CC, H00y0, ldh60;\n" . + "--:-:-:-:1 IADD.X H00y1, H00y1, RZ;\n" . + "--:-:-:-:6 IADD H04y0.CC, H04y0, ldh60;\n" . + "--:-:-:-:1 IADD.X H04y1, H04y1, RZ;\n" . 
+ "--:-:-:-:6 IADD H08y0.CC, H08y0, ldh60;\n" . + "--:-:-:-:1 IADD.X H08y1, H08y1, RZ;\n" . + "--:-:-:-:6 IADD H12y0.CC, H12y0, ldh60;\n" . + "--:-:-:-:1 IADD.X H12y1, H12y1, RZ;\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:1 MOV lockAddr0, param_lockAddr[0]; +--:-:-:-:1 MOV lockAddr1, param_lockAddr[1]; + +// time_step = time_step + 1 +--:-:-:-:6 IADD time_step, time_step, 1; +--:-:-:-:1 ISETP.LT.AND P0, PT, time_step, param_unrolling, PT; + +// Synchronize all blocks +--:-:-:-:1 ISETP.NE.AND P1, PT, tid, RZ, PT; +--:-:-:-:6 XMAD blkId, blkB, param_numAblks, blkA; +--:-:-:-:6 IADD nextBlk, blkId, 1; +--:-:-:-:8 ISETP.EQ.OR P2, PT, nextBlk, param_numBlks, P1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 SSY SSY_TARGET1; +--:-:-:-:d @P1 SYNC; +--:-:-:-:6 @P2 MOV nextBlk, RZ; + +SPINLOCK1: +--:-:1:Y:2 ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk; +01:-:-:Y:d ISETP.NE.AND P1, PT, lockVal, blkId, PT; +--:-:-:-:d @P1 BRA.U SPINLOCK1; +--:-:-:-:d SYNC; + +SSY_TARGET1: +--:-:-:-:1 SSY SSY_TARGET2; +--:-:-:-:d @P2 SYNC; +--:-:-:-:6 MOV nextBlk, RZ; + +SPINLOCK2: +--:-:1:Y:2 ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk; +01:-:-:Y:d ISETP.NE.AND P1, PT, lockVal, RZ, PT; +--:-:-:-:5 @P1 BRA.U SPINLOCK2; +--:-:-:-:d SYNC; + +SSY_TARGET2: +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:f MEMBAR.GL; + +//Loop back to beginning of GEMM loop +--:-:-:Y:5 @P0 BRA.U RNN_LOOP; + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 LDG.E h0, [H00y]; +--:-:-:-:1 LDG.E h1, [H04y]; +--:-:-:-:1 LDG.E h2, [H08y]; +--:-:-:-:1 LDG.E h3, [H12y]; + + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + 
+--:-:1:-:1 @P0 LDG.E d0, [C00y]; +--:-:2:-:1 @P1 LDG.E d1, [C04y]; +--:-:3:-:1 @P2 LDG.E d2, [C08y]; +--:-:4:-:1 @P3 LDG.E d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:3 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*32>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*32>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*32>]; + + +--:-:-:-:1 P2R predSave, PR, RZ, 0x0f; + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, d3, beta, c3; + +//Bprop for activation: Rectlinclip + +--:-:-:-:1 FSETP.LT.AND P0, PT, RZ, h0, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, RZ, h1, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, RZ, h2, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, RZ, h3, PT; +--:-:-:-:1 FSETP.LT.AND P0, PT, h0, xcutoff, P0; +--:-:-:-:1 FSETP.LT.AND P1, PT, h1, xcutoff, P1; +--:-:-:-:1 FSETP.LT.AND P2, PT, h2, xcutoff, P2; +--:-:-:-:1 FSETP.LT.AND P3, PT, h3, xcutoff, P3; +--:-:-:-:1 SEL c0, c0, RZ, P0; +--:-:-:-:1 SEL c1, c1, RZ, P1; +--:-:-:-:1 SEL c2, c2, RZ, P2; +--:-:-:-:1 SEL c3, c3, RZ, P3; + + +--:-:-:Y:d R2P PR, predSave, 0x0f; + +--:1:-:-:1 @P0 STG.E [C00y], c0; +--:2:-:-:1 @P1 STG.E [C04y], c1; +--:3:-:-:1 @P2 STG.E [C08y], c2; +--:4:-:-:1 @P3 STG.E [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; 
+--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:1 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:6 IADD H00y0.CC, H00y0, ldh1; +--:-:-:-:1 IADD.X H00y1, H00y1, RZ; +--:-:-:-:6 IADD H04y0.CC, H04y0, ldh1; +--:-:-:-:1 IADD.X H04y1, H04y1, RZ; +--:-:-:-:6 IADD H08y0.CC, H08y0, ldh1; +--:-:-:-:1 IADD.X H08y1, H08y1, RZ; +--:-:-:-:6 IADD H12y0.CC, H12y0, ldh1; +--:-:-:-:0 IADD.X H12y1, H12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/sgemm_rnn_common_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_rnn_common_128x32.sass new file mode 100644 index 0000000..67bda6f --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_rnn_common_128x32.sass @@ -0,0 +1,348 @@ +# sgemm_common_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00 + 0*8>];
+--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*32 + 00 + 0*8>];
+--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+
+
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+    # Generate the unrolled main loop: 16 j-steps of 32 FFMAs each, with %insert entries spliced in.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # the second x base visits the y bases in reverse order
+    }
+    # @cOrder now holds 32 [x,y] pairs; whatever is in @top is emitted ahead of the loop body.
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        my $barrier = $j & 1 ? 2 : 1;
+        my $rsPred = $j >= 14 ? '@P0' : ' ';
+        my $loadReg = ($j + 2) & 3;
+        my $shareLine = ($j + 2) & 15;
+        my $shiftA = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute = $j & 3;
+
+        # The shared loads for line (j+2)&15 unconditionally take slots c0/c2/c4 of step j.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins = $insert{"j${j}c$c"} || '';
+
+            my $wait = $c == 0 ? "0$barrier" : '--';
+            # Stall 0 when the first inserted line names one of these ops; the anchored match never sees undef for an empty $ins.
+            my $stall = $ins =~ /\A[^\n]*(?:LDS|F2F|I2I|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $yield = $c == 16 && $stall ? 'Y' : '-';
+
+            my $ctrl = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins;
+        }
+    }
+    return $out;
+
+
+
+
+
+--:-:-:-:1 MOV alpha, param_alpha;
+--:-:-:-:1 MOV beta, param_beta;
+--:-:-:-:1 MOV flags, param_flags;
+--:-:-:-:5 MOV xcutoff, param_xcutoff;
+
+--:-:-:-:6 LOP.AND.NZ P0, RZ, flags, 4;
+--:-:-:-:6 @P0 IADD offsetC, -time_step, param_unrolling;
+--:-:-:-:6 @P0 IADD offsetC, offsetC, -1;
+--:-:-:-:6 @!P0 MOV offsetC, time_step;
+
+// baseC = param_C + dimC * time_step
+--:-:-:-:1 XMAD offsetC, offsetC, param_dimC, RZ;
+--:-:-:-:1 LEA baseC0.CC, offsetC, param_C[0], 2;
+--:-:-:-:1 LEA.HI.X baseC1, offsetC, param_C[1], RZ, 2;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1 IADD readBs, readBs, -4x;
+--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf;
+--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf;
+--:-:-:-:1 ISCADD writeCs, readAs, readBs, 3;
+
+// readCs = ((tid & 96) << 2) | (tid & 31) << 2;
+--:-:-:-:1 LOP.AND tid31, tid, 31;
+--:-:-:-:1 LOP.AND tid96, tid, 96;
+--:-:-:-:1 ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1 SHL readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1 ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1 SHR.U32 cy00, tid96, 1;
+--:-:-:-:1 ISCADD cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
+--:-:-:-:1 MOV ldc, param_ldc;
+--:-:-:-:1 MOV ldcz, param_ldcz;
+--:-:-:-:1 XMAD.LO ci, ldc, cy00, cx, xmad_c;
+--:-:-:-:1 XMAD.LO2 ci, ldcz, RZ, ci;
+--:-:-:-:1 LEA C00y0.CC, ci, baseC0, 2;
+--:-:-:-:1 LEA.HI.X C00y1, ci, baseC1, RZ, 2;
+
+// Apply relu
+--:-:-:-:0 LOP.AND.NZ P4, RZ, flags, 2;
+// cx < n
+--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT;
+// beta != 0
+--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+
+--:-:-:-:1 SHL ldc1, ldc, 2;
+--:-:-:-:1 SHL ldc4, ldc, 4;
+--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8;
+
+
+--:-:-:-:4 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 MOV d0, RZ; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:4 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 MOV d1, RZ; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:3 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 MOV d2, RZ; +--:-:-:-:1 MOV d3, RZ; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:1 IADD.X C12y1, C08y1, RZ; + + +// bias_track = bias + cy +--:-:-:-:1 LEA bias00y0.CC, cy00, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X bias00y1, cy00, param_bias[1], RZ, 2; +--:-:-:-:1 LEA bias04y0.CC, cy04, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X bias04y1, cy04, param_bias[1], RZ, 2; +--:-:-:-:1 LEA bias08y0.CC, cy08, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X bias08y1, cy08, param_bias[1], RZ, 2; +--:-:-:-:1 LEA bias12y0.CC, cy12, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X bias12y1, cy12, param_bias[1], RZ, 2; + + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" . + "--:-:-:-:6 IADD bias00y0.CC, bias00y0, 240;\n" . + "--:-:-:-:1 IADD.X bias00y1, bias00y1, RZ;\n" . + "--:-:-:-:6 IADD bias04y0.CC, bias04y0, 240;\n" . + "--:-:-:-:1 IADD.X bias04y1, bias04y1, RZ;\n" . + "--:-:-:-:6 IADD bias08y0.CC, bias08y0, 240;\n" . + "--:-:-:-:1 IADD.X bias08y1, bias08y1, RZ;\n" . + "--:-:-:-:6 IADD bias12y0.CC, bias12y0, 240;\n" . + "--:-:-:-:1 IADD.X bias12y1, bias12y1, RZ;\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . 
+ "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:1 MOV lockAddr0, param_lockAddr[0]; +--:-:-:-:1 MOV lockAddr1, param_lockAddr[1]; + +// time_step = time_step + 1 +--:-:-:-:6 IADD time_step, time_step, 1; +--:-:-:-:1 ISETP.LT.AND P0, PT, time_step, param_unrolling, PT; + +// Synchronize all blocks +--:-:-:-:1 ISETP.NE.AND P1, PT, tid, RZ, PT; +--:-:-:-:6 XMAD blkId, blkB, param_numAblks, blkA; +--:-:-:-:6 IADD nextBlk, blkId, 1; +--:-:-:-:8 ISETP.EQ.OR P2, PT, nextBlk, param_numBlks, P1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 SSY SSY_TARGET1; +--:-:-:-:d @P1 SYNC; +--:-:-:-:6 @P2 MOV nextBlk, RZ; + +SPINLOCK1: +--:-:1:Y:2 ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk; +01:-:-:Y:d ISETP.NE.AND P1, PT, lockVal, blkId, PT; +--:-:-:-:d @P1 BRA.U SPINLOCK1; +--:-:-:-:d SYNC; + +SSY_TARGET1: +--:-:-:-:1 SSY SSY_TARGET2; +--:-:-:-:d @P2 SYNC; +--:-:-:-:6 MOV nextBlk, RZ; + +SPINLOCK2: +--:-:1:Y:2 ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk; +01:-:-:Y:d ISETP.NE.AND P1, PT, lockVal, RZ, PT; +--:-:-:-:5 @P1 BRA.U SPINLOCK2; +--:-:-:-:d SYNC; + +SSY_TARGET2: +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:f MEMBAR.GL; + +//Loop back to beginning of GEMM loop +--:-:-:Y:5 @P0 BRA.U RNN_LOOP; + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 LDG.E.CI b0, [bias00y]; +--:-:-:-:1 LDG.E.CI b1, [bias04y]; +--:-:-:-:1 LDG.E.CI b2, [bias08y]; +--:-:-:-:1 LDG.E.CI b3, [bias12y]; + + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C00y]; +--:-:2:-:1 @P1 LDG.E d1, [C04y]; +--:-:3:-:1 @P2 LDG.E d2, [C08y]; +--:-:4:-:1 @P3 LDG.E d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV 
d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:3 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*32>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*32>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*32>]; + + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, d3, beta, c3; + +--:-:-:-:1 FADD c0, c0, b0; +--:-:-:-:1 FADD c1, c1, b1; +--:-:-:-:1 FADD c2, c2, b2; +--:-:-:-:3 FADD c3, c3, b3; + +//Activation function: Rectlinclip + +--:-:-:-:1 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 FMNMX c2, c2, RZ, !PT; +--:-:-:-:3 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 FMNMX c0, c0, xcutoff, PT; +--:-:-:-:1 FMNMX c1, c1, xcutoff, PT; +--:-:-:-:1 FMNMX c2, c2, xcutoff, PT; +--:-:-:-:3 FMNMX c3, c3, xcutoff, PT; + + +--:1:-:-:1 @P0 STG.E [C00y], c0; +--:2:-:-:1 @P1 STG.E [C04y], c1; +--:3:-:-:1 @P2 STG.E [C08y], c2; +--:4:-:-:1 @P3 STG.E [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:1 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:6 IADD bias00y0.CC, bias00y0, 4; +--:-:-:-:1 IADD.X bias00y1, bias00y1, RZ; +--:-:-:-:6 IADD bias04y0.CC, bias04y0, 4; +--:-:-:-:1 IADD.X bias04y1, bias04y1, RZ; +--:-:-:-:6 IADD bias08y0.CC, bias08y0, 4; 
+--:-:-:-:1 IADD.X bias08y1, bias08y1, RZ; +--:-:-:-:6 IADD bias12y0.CC, bias12y0, 4; +--:-:-:-:0 IADD.X bias12y1, bias12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Maxwell/sgemm_tn_128x128.sass b/Kernel/SGEMM/Maxwell/sgemm_tn_128x128.sass new file mode 100644 index 0000000..5099001 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_tn_128x128.sass @@ -0,0 +1,279 @@ +# Kernel: sgemm_tn_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, tid1, tid7, tidX, blk, tid31, tid128 + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-95 ~ x<1-3>, y<1-3> + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-103 : loadA<0-3>, loadB<0-3> + + 104-107 : trackA<0-1>, trackB<0-1> + + 108-121 ~ writeS, lda8, k, tidY, txa, txb, ta, tb, loop + 122-127 ~ readAs, readBs, tid + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV loop, RZ; +--:-:-:-:1 STS.128 [addr_zero], RZ; + + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); + + +// tidX = (tid & 31) << 2 +// tidY = (tid >> 5) & 7 +01:-:-:-:1 LOP.AND tid31, tid, 31; 
+--:-:-:-:1 SHL tidX, tid31, 2; +--:-:-:-:1 BFE.U32 tidY, tid, 0x305; // 3 bits at position 5 + +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 5; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// trackA += (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 4 +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// trackB += (blkB*128 + ldb*tidY + tidX + ldbz*blkZ) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidX, 7; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = ((128*tidY + tidX) * 4) ^ 4x<128*8*2> +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 SHL writeS, writeS, 2; +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<128*8*2>; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + + + + + our $vec; + return $vec ?
q{ +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:1 LOP.AND.NZ P4, RZ, k, 7; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 8, P4; + +// doLoad = tidY < k && txa|txb < n|m +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P6; + +--:-:2:-:1 @P2 LDG.E.CI.128 loadA, [trackA]; +--:-:3:-:1 @P3 LDG.E.CI.128 loadB, [trackB]; + +--:-:5:-:1 @!P2 LDS.U.128 loadA, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 loadB, [addr_zero]; + + // Vec 4 and scalar loads + } : q{ + +// doLoadA = tidY < k && txa < m +// doLoadB = tidY < k && txb < n +--:-:-:-:1 IADD x1, txa, 1; +--:-:-:-:1 IADD x2, txa, 2; +--:-:-:-:1 IADD x3, txa, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_m, P0; + +--:-:2:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 IADD y1, txb, 1; +--:-:-:-:1 IADD y2, txb, 2; +--:-:-:-:1 IADD y3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_n, P0; + +--:-:3:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 8, PT; + }; + + + + +12:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadA0; + +--:-:-:-:6 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + 
+24:-:-:-:1 STS.128 [writeS + 4x<8*128>], loadB0; + +--:-:-:-:1 IADD trackB0.CC, trackB0, param_ldb8; + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<128*8*2>; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n"); + + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + ($vec ? + ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI.128 loadA, [trackA];\n", + j0c13 => "--:-:3:-:1 \@P3 LDG.E.CI.128 loadB, [trackB];\n", + ) : + ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j0c29 => "--:-:2:-:1 \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j0c31 => "--:-:2:-:1 \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j0c33 => "--:-:2:-:1 \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + + j0c35 => "--:-:3:-:1 \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j1c29 => "--:-:3:-:1 \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j1c31 => "--:-:3:-:1 \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j1c33 => "--:-:3:-:1 \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j5c33 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*128>], loadA0;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, param_lda8;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c33 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<8*128>], loadB0;\n", + + j6c46 => "--:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" . + "--:-:-:-:1 IADD32I k, k, -8;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . 
+ "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_tn_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_tn_128x32.sass new file mode 100644 index 0000000..0b9ffc1 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_tn_128x32.sass @@ -0,0 +1,447 @@ +# Kernel: sgemm_tn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<128*16*2 + 32*16*2> + szShareA : 128*16 + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : 
j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 5; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL lda16, lda, 6; +--:-:-:-:1 SHL ldb16, ldb, 6; +--:-:-:-:1 SHL lda4, lda, 2; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = (tid & 31) << 2 +// tidAY = (tid >> 5) +01:-:-:-:1 LOP.AND tidAX, tid, 31; +--:-:-:-:1 SHL tidAX, tidAX, 2; +--:-:-:-:1 SHR.U32 tidAY, tid, 5; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += (blkA*128 + tidAX + lda*tidAY) * 4 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO2 ta0, lda, tidAY, txa; +08:-:-:-:1 XMAD.LO2 ta0, ldaz, blkZ, ta0; +--:-:-:-:1 IADD ta1, ta0, lda4; +--:-:-:-:1 IADD ta2, ta1, lda4; +--:-:-:-:1 IADD ta3, ta2, lda4; + +--:-:-:-:1 LEA track0A0.CC, ta0, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta0, param_A[1], RZ, 2; +--:-:-:-:1 LEA track1A0.CC, ta1, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track1A1, ta1, param_A[1], RZ, 2; 
+--:-:-:-:1 LEA track2A0.CC, ta2, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track2A1, ta2, param_A[1], RZ, 2; +--:-:-:-:1 LEA track3A0.CC, ta3, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track3A1, ta3, param_A[1], RZ, 2; + +// trackB += (blkB*32 + ldb*tidBY + tidBX + ldbz*blkZ) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 2; + +// writeAs = (tidAY*128 + tidAX) * 4 + 4x<szShareA + szShareB> (write buffers start high) +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB> +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x<szShareA>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<szShareA + szShareB>; + + +REMAINDER: + + + +--:-:-:-:1 IADD tidAY1, tidAY, 4; +--:-:-:-:1 IADD tidAY2, tidAY, 8; +--:-:-:-:1 IADD tidAY3, tidAY, 12; + + + our $vec; + return $vec ?
q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.CI.128 load0A, [track0A]; +--:-:2:-:1 @P1 LDG.E.CI.128 load1A, [track1A]; +--:-:3:-:1 @P2 LDG.E.CI.128 load2A, [track2A]; +--:-:4:-:1 @P3 LDG.E.CI.128 load3A, [track3A]; +--:-:5:-:1 @P4 LDG.E.CI.128 loadB, [trackB]; + + + +--:-:6:-:1 @!P0 LDS.U.128 load0A, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 load1A, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.128 load2A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load3A, [addr_zero]; +--:-:6:-:2 @!P4 LDS.U.128 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD txa1, txa, 1; +--:-:-:-:1 IADD txa2, txa, 2; +--:-:-:-:1 IADD txa3, txa, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P4; + +--:-:1:-:1 @P0 LDG.E.CI load0A0, [track0A + 4x<0>]; +--:-:1:-:1 @P1 LDG.E.CI load0A1, [track0A + 4x<1>]; +--:-:1:-:1 @P2 LDG.E.CI load0A2, [track0A + 4x<2>]; +--:-:1:-:1 @P3 LDG.E.CI load0A3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY1, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:2:-:1 @P0 LDG.E.CI load1A0, [track1A + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI load1A1, [track1A + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI load1A2, [track1A + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI load1A3, 
[track1A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P6; + +--:-:3:-:1 @P0 LDG.E.CI load2A0, [track2A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI load2A1, [track2A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load2A2, [track2A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI load2A3, [track2A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:4:-:1 @P0 LDG.E.CI load3A0, [track3A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI load3A1, [track3A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI load3A2, [track3A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load3A3, [track3A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:5:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:5:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:5:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; 
+--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 15; + + + +21:-:-:-:1 STS.128 [writeAs + 4x<0*128>], load0A; +--:-:-:-:6 IADD track0A0.CC, track0A0, lda16; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS.128 [writeAs + 4x<4*128>], load1A; +--:-:-:-:6 IADD track1A0.CC, track1A0, lda16; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS.128 [writeAs + 4x<8*128>], load2A; +--:-:-:-:6 IADD track2A0.CC, track2A0, lda16; +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS.128 [writeAs + 4x<12*128>], load3A; +--:-:-:-:6 IADD track3A0.CC, track3A0, lda16; +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P5 LDG.E.CI.128 load0A, [track0A]; +--:-:4:-:1 @P5 LDG.E.CI.128 load1A, [track1A]; +--:-:5:-:1 @P5 LDG.E.CI.128 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.CI.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.128 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P5 LDG.E.CI load0A0, [track0A + 4x<0>]; +--:-:3:-:1 @P5 LDG.E.CI load0A1, [track0A + 4x<1>]; +--:-:3:-:1 @P5 LDG.E.CI load0A2, [track0A + 4x<2>]; +--:-:3:-:1 @P5 LDG.E.CI load0A3, [track0A + 4x<3>]; + +--:-:4:-:1 @P5 LDG.E.CI load1A0, [track1A + 4x<0>]; +--:-:4:-:1 @P5 LDG.E.CI load1A1, [track1A + 4x<1>]; +--:-:4:-:1 @P5 LDG.E.CI load1A2, [track1A + 4x<2>]; +--:-:4:-:1 @P5 LDG.E.CI load1A3, [track1A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load2A0, [track2A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load2A1, [track2A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load2A2, [track2A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load2A3, [track2A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load3A0, [track3A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load3A1, [track3A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load3A2, [track3A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load3A3, [track3A + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:6:-:1 @P6 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P6 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P6 LDG.E.CI loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 0; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n", + j9c6 => "--:5:-:-:1 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n", + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, lda16;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 
IADD track1A0.CC, track1A0, lda16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2A0.CC, track2A0, lda16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, lda16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.128 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.128 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.128 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI load2A0, [track2A + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E.CI load2A1, [track2A + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E.CI load2A2, [track2A + 4x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E.CI load2A3, [track2A + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . 
+ "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_tn_128x64.sass b/Kernel/SGEMM/Maxwell/sgemm_tn_128x64.sass new file mode 100644 index 0000000..74f13cc --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_tn_128x64.sass @@ -0,0 +1,326 @@ +# Kernel: sgemm_tn_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<128*8*2 + 64*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ lda, ldb, ldaz, ldbz, tid1, ta, tb, tid7, tid15, tidX, blk, txa64, xmad_tb, tid, blkA, blkB, blkZ + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 
37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-95 ~ x<1-3>, x<65-67>, y<1-3> + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-107 : loadA<0-7>, loadB<0-3> + 108-111 : trackA<0-1>, trackB<0-1> + + 112-125 ~ writeAs, writeBs, k, tidY, txa, txb, swapBuf + 126-127 ~ readAs, readBs + + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-125 ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +--:-:-:-:1 LOP.AND tid1, tid, 1; +01:-:-:-:1 LOP.AND tid15, tid, 15; + +// tidX = (tid & 15) << 2 +// tidY = (tid >> 4) & 7 +--:-:-:-:1 SHL tidX, tid15, 2; +--:-:-:-:1 BFE.U32 tidY, tid, 0x304; // 3 bits at position 4 + +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 5; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + + +// trackA += (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 4 +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x2; + +--:-:-:-:1 IADD txa64, txa, 64; +--:-:-:-:1 ISETP.LT.AND P4, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa64, param_m, PT; + +// trackB += (blkB*64 + tidX + ldb*tidY + ldbz*blkZ) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + 
+--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// Start the write buffers high +// writeAs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; +// writeBs = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; + +// Start the read buffers low +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<64*8 + 128*8>; + + + +REMAINDER: + + + + + our $vec; + return $vec ? q{ + +// doLoad = tidY < k && txa|txb < n|m +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P6; + + +--:-:2:-:1 @P1 LDG.E.CI.128 loadA0, [trackA + 4x< 0>]; +--:-:3:-:1 @P2 LDG.E.CI.128 loadA4, [trackA + 4x<64>]; +--:-:4:-:1 @P3 LDG.E.CI.128 loadB0, [trackB]; + +--:-:5:-:2 @!P1 LDS.U.128 loadA0, [addr_zero]; +--:-:5:-:2 @!P2 LDS.U.128 loadA4, [addr_zero]; +--:-:6:-:2 @!P3 LDS.U.128 loadB0, [addr_zero]; + + +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:1 LOP.AND.NZ P1, RZ, k, 7; + + // Vec 4 and scalar loads + } : q{ + +// doLoadA = tidY < k && txa < m +// doLoadB = tidY < k && txb < n +--:-:-:-:1 IADD x1, txa, 1; +--:-:-:-:1 IADD x2, txa, 2; +--:-:-:-:1 IADD x3, txa, 3; +--:-:-:-:1 IADD x65, txa, 65; +--:-:-:-:1 IADD x66, txa, 66; +--:-:-:-:1 IADD x67, txa, 67; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_m, P0; + +--:-:2:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI 
loadA1, [trackA + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x65, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x66, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x67, param_m, P0; + +--:-:3:-:1 @P0 LDG.E.CI loadA4, [trackA + 4x<64>]; +--:-:3:-:1 @P1 LDG.E.CI loadA5, [trackA + 4x<65>]; +--:-:3:-:1 @P2 LDG.E.CI loadA6, [trackA + 4x<66>]; +--:-:3:-:1 @P3 LDG.E.CI loadA7, [trackA + 4x<67>]; + +--:-:-:-:1 @!P0 MOV loadA4, RZ; +--:-:-:-:1 @!P1 MOV loadA5, RZ; +--:-:-:-:1 @!P2 MOV loadA6, RZ; +--:-:-:-:1 @!P3 MOV loadA7, RZ; + +--:-:-:-:1 IADD y1, txb, 1; +--:-:-:-:1 IADD y2, txb, 2; +--:-:-:-:1 IADD y3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_n, P0; + +--:-:4:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + }; + + + + +12:-:-:-:1 STS.128 [writeAs + 4x< 0>], loadA0; +04:-:-:-:1 STS.128 [writeAs + 4x<64>], loadA4; + +--:-:-:-:6 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +28:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:1 IADD.X trackB1, trackB1, RZ; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, 
-swapBuf; + + + our $vec; + return $vec ? q{ +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; + } : q{ +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + }; + + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P4;\n"); + + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P5;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, P6;\n", + + ($vec ? + ( + j0c13 => "--:-:2:-:1 \@P2 LDG.E.CI.128 loadA0, [trackA + 4x< 0>];\n", + j0c15 => "--:-:3:-:1 \@P3 LDG.E.CI.128 loadA4, [trackA + 4x<64>];\n", + j0c33 => "--:-:4:-:1 \@P0 LDG.E.CI.128 loadB0, [trackB];\n", + ) : + ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j0c12 => "--:-:2:-:1 \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j0c14 => "--:-:2:-:1 \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j0c16 => "--:-:2:-:1 \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + + j0c29 => "--:-:3:-:1 \@P3 LDG.E.CI loadA4, [trackA + 4x<64>];\n", + j0c31 => "--:-:3:-:1 \@P3 LDG.E.CI loadA5, [trackA + 4x<65>];\n", + j0c33 => "--:-:3:-:1 \@P3 LDG.E.CI loadA6, [trackA + 4x<66>];\n", + j0c35 => "--:-:3:-:1 \@P3 LDG.E.CI loadA7, [trackA + 4x<67>];\n", + + j1c29 => "--:-:4:-:1 \@P0 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j1c31 => "--:-:4:-:1 \@P0 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j1c33 => "--:-:4:-:1 \@P0 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j1c35 => "--:-:4:-:1 \@P0 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j1c37 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + j1c39 => "--:-:-:-:1 IADD32I k, k, -8;\n", + + j5c31 => "02:-:-:-:1 \@P0 STS.128 [writeAs + 4x< 0>], loadA0;\n", + j5c33 => "04:-:-:-:1 \@P0 STS.128 [writeAs + 4x<64>], loadA4;\n", + + j5c46 => "--:-:-:-:1 \@P0 IADD trackA0.CC, trackA0, param_lda8;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X trackA1, trackA1, RZ;\n", + + j6c39 => "08:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c46 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c54 => "--:-:-:-:1 
\@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Maxwell/sgemm_tn_rnn_bprop_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_tn_rnn_bprop_128x32.sass new file mode 100644 index 0000000..3db4612 --- /dev/null +++ b/Kernel/SGEMM/Maxwell/sgemm_tn_rnn_bprop_128x32.sass @@ -0,0 +1,476 @@ +# Kernel: sgemm_tn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*16*2 + 32*16*2> + szShareA : 128*16 + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_H[0] : c[0x0][0x158] + param_H[1] : c[0x0][0x15c] + param_lockAddr[0] : c[0x0][0x160] + param_lockAddr[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_xcutoff : c[0x0][0x170] + param_flags : c[0x0][0x174] + param_lda8 : c[0x0][0x178] + param_ldb8 : c[0x0][0x17c] + param_ldc : c[0x0][0x180] + param_ldh : c[0x0][0x184] + param_m : c[0x0][0x188] + param_n : c[0x0][0x18c] + param_k : c[0x0][0x190] + param_ldaz : c[0x0][0x194] + param_ldbz : c[0x0][0x198] + param_ldcz : c[0x0][0x19c] + param_loops : c[0x0][0x1a0] + param_dimB : c[0x0][0x1a4] + param_dimC : c[0x0][0x1a8] + param_dimH : c[0x0][0x1ac] + param_unrolling : c[0x0][0x1b0] + param_numBlks : c[0x0][0x1b4] + param_numAblks : c[0x0][0x1b8] + + + + + 32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>, offsetB + 80-81 : baseB<0-1> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, time_step + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-63 : H00y<0-1>, H04y<0-1>, H08y<0-1>, H12y<0-1>, 
h0, h1, h2, h3, baseC<0-1>, baseH<0-1> + 64-68 : blkId, nextBlk, lockAddr<0-1>, lockVal + 69-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags, xcutoff, offsetC, offsetH, numBlk, predSave, ldh1, ldh4, ldh60 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; + +--:-:-:-:1 MOV time_step, RZ; +--:-:-:-:1 MOV flags, param_flags; + +RNN_LOOP: + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 5; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL lda16, lda, 6; +--:-:-:-:1 SHL ldb16, ldb, 6; +--:-:-:-:1 SHL lda4, lda, 2; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +--:-:-:-:6 LOP.AND.NZ P0, RZ, flags, 4; +--:-:-:-:6 @P0 IADD offsetB, -time_step, param_unrolling; +--:-:-:-:6 @P0 IADD offsetB, offsetB, -1; +--:-:-:-:6 @!P0 MOV offsetB, time_step; + +// baseB = param_B + dimB * time_step +--:-:-:-:1 XMAD offsetB, offsetB, param_dimB, RZ; +--:-:-:-:1 LEA baseB0.CC, offsetB, param_B[0], 2; +--:-:-:-:1 LEA.HI.X baseB1, offsetB, param_B[1], RZ, 2; + +// tidAX = (tid & 31) << 2 +// tidAY = (tid >> 5) +01:-:-:-:1 LOP.AND tidAX, tid, 31; +--:-:-:-:1 SHL tidAX, tidAX, 2; +--:-:-:-:1 SHR.U32 tidAY, tid, 5; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += (blkA*128 + tidAX + lda*tidAY) * 4 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO2 ta0, lda, tidAY, txa; +08:-:-:-:1 XMAD.LO2 ta0, ldaz, RZ, ta0; +--:-:-:-:1 IADD ta1, ta0, lda4; +--:-:-:-:1 IADD ta2, ta1, lda4; +--:-:-:-:1 IADD ta3, ta2, lda4; + +--:-:-:-:1 LEA track0A0.CC, ta0, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta0, param_A[1], RZ, 
2; +--:-:-:-:1 LEA track1A0.CC, ta1, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track1A1, ta1, param_A[1], RZ, 2; +--:-:-:-:1 LEA track2A0.CC, ta2, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track2A1, ta2, param_A[1], RZ, 2; +--:-:-:-:1 LEA track3A0.CC, ta3, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track3A1, ta3, param_A[1], RZ, 2; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, RZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, baseB0, 2; +--:-:-:-:1 LEA.HI.X trackB1, tb, baseB1, RZ, 2; + +// writeAs = (tidAY*128 + tidAX) * 4 + 4x< szShareA + szShareB> (write buffers start in the high half) +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x< szShareA + szShareB>, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 + 4x< szShareA*2 + szShareB> (B tile follows the A tile of the high half) +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x< szShareA*2 + szShareB>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = (((tid >> 1) & 7) << 4) + 4x< szShareA> (read buffers start in the low half; B tile follows the A tile) +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x< szShareA>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x< szShareA + szShareB>; // signed byte distance between the two buffer halves. NOTE(review): the four 4x operands above were empty in the checked-in text; values restored from szShareA/szShareB by analogy with sgemm_tn_128x64 - confirm against upstream + + +REMAINDER: + + + +--:-:-:-:1 IADD tidAY1, tidAY, 4; +--:-:-:-:1 IADD tidAY2, tidAY, 8; +--:-:-:-:1 IADD tidAY3, tidAY, 12; + + + our $vec; + return $vec ?
q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.128 load0A, [track0A]; +--:-:2:-:1 @P1 LDG.E.128 load1A, [track1A]; +--:-:3:-:1 @P2 LDG.E.128 load2A, [track2A]; +--:-:4:-:1 @P3 LDG.E.128 load3A, [track3A]; +--:-:5:-:1 @P4 LDG.E.128 loadB, [trackB]; + + + +--:-:6:-:1 @!P0 LDS.U.128 load0A, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 load1A, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.128 load2A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load3A, [addr_zero]; +--:-:6:-:2 @!P4 LDS.U.128 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD txa1, txa, 1; +--:-:-:-:1 IADD txa2, txa, 2; +--:-:-:-:1 IADD txa3, txa, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P4; + +--:-:1:-:1 @P0 LDG.E load0A0, [track0A + 4x<0>]; +--:-:1:-:1 @P1 LDG.E load0A1, [track0A + 4x<1>]; +--:-:1:-:1 @P2 LDG.E load0A2, [track0A + 4x<2>]; +--:-:1:-:1 @P3 LDG.E load0A3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY1, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:2:-:1 @P0 LDG.E load1A0, [track1A + 4x<0>]; +--:-:2:-:1 @P1 LDG.E load1A1, [track1A + 4x<1>]; +--:-:2:-:1 @P2 LDG.E load1A2, [track1A + 4x<2>]; +--:-:2:-:1 @P3 LDG.E load1A3, [track1A + 4x<3>]; + +--:-:-:-:1 @!P0 
MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P6; + +--:-:3:-:1 @P0 LDG.E load2A0, [track2A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E load2A1, [track2A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E load2A2, [track2A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E load2A3, [track2A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:4:-:1 @P0 LDG.E load3A0, [track3A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E load3A1, [track3A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E load3A2, [track3A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E load3A3, [track3A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:5:-:1 @P0 LDG.E loadB0, [trackB + 4x<0>]; +--:-:5:-:1 @P1 LDG.E loadB1, [trackB + 4x<1>]; +--:-:5:-:1 @P2 LDG.E loadB2, [trackB + 4x<2>]; +--:-:5:-:1 @P3 LDG.E loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + 
+--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 15; + + + +21:-:-:-:1 STS.128 [writeAs + 4x<0*128>], load0A; +--:-:-:-:6 IADD track0A0.CC, track0A0, lda16; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS.128 [writeAs + 4x<4*128>], load1A; +--:-:-:-:6 IADD track1A0.CC, track1A0, lda16; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS.128 [writeAs + 4x<8*128>], load2A; +--:-:-:-:6 IADD track2A0.CC, track2A0, lda16; +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS.128 [writeAs + 4x<12*128>], load3A; +--:-:-:-:6 IADD track3A0.CC, track3A0, lda16; +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P5 LDG.E.128 load0A, [track0A]; +--:-:4:-:1 @P5 LDG.E.128 load1A, [track1A]; +--:-:5:-:1 @P5 LDG.E.128 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.128 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P5 LDG.E load0A0, [track0A + 4x<0>]; +--:-:3:-:1 @P5 LDG.E load0A1, [track0A + 4x<1>]; +--:-:3:-:1 @P5 LDG.E load0A2, [track0A + 4x<2>]; +--:-:3:-:1 @P5 LDG.E load0A3, [track0A + 4x<3>]; + +--:-:4:-:1 @P5 LDG.E load1A0, [track1A + 4x<0>]; +--:-:4:-:1 @P5 LDG.E load1A1, [track1A + 4x<1>]; +--:-:4:-:1 @P5 LDG.E load1A2, [track1A + 4x<2>]; +--:-:4:-:1 @P5 LDG.E load1A3, [track1A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E load2A0, [track2A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E load2A1, [track2A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E load2A2, [track2A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E load2A3, [track2A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E load3A0, [track3A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E load3A1, [track3A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E load3A2, [track3A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E load3A3, [track3A + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E loadB0, [trackB + 4x<0>]; +--:-:6:-:1 @P6 LDG.E loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P6 LDG.E loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P6 LDG.E loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 0; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n", + j9c6 => "--:5:-:-:1 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n", + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, lda16;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, lda16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X 
track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2A0.CC, track2A0, lda16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, lda16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.128 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.128 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.128 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.128 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E load0A0, [track0A + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E load0A1, [track0A + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E load0A2, [track0A + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E load0A3, [track0A + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E load1A0, [track1A + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E load1A1, [track1A + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E load1A2, [track1A + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E load1A3, [track1A + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E load2A0, [track2A + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E load2A1, [track2A + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E 
load2A2, [track2A + 4x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E load2A3, [track2A + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E load3A0, [track3A + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E load3A1, [track3A + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E load3A2, [track3A + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E load3A3, [track3A + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E loadB0, [trackB + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E loadB1, [trackB + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E loadB2, [trackB + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/hgemm_common_128x128.sass b/Kernel/SGEMM/Pascal/hgemm_common_128x128.sass new file mode 100644 index 0000000..d699483 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_common_128x128.sass @@ -0,0 +1,412 @@ +# hgemm_common_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +[- + +our $int16; # true selects the int16 variants emitted by the helpers below, false the fp16 ones; presumably set by the including kernel - confirm + +sub convert_in { # opcode that converts a loaded 16-bit value to fp32: I2F.F32.S16 when $int16 is set, else F2F.F32.F16 + return $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; +} + + +sub convert_out { # opcode that converts an fp32 value back to 16 bits: F2I.S16.F32 when $int16 is set, else F2F.F16.F32 + return $int16 ? 'F2I.S16.F32': 'F2F.F16.F32'; +} + + +sub scale_int16 { # when $int16 is set: FMULs that scale c0-c3 by param_scale; otherwise an empty string + return $int16?
q{
+--:-:-:-:1 FMUL c0, c0, param_scale;
+--:-:-:-:1 FMUL c1, c1, param_scale;
+--:-:-:-:1 FMUL c2, c2, param_scale;
+--:-:-:-:0 FMUL c3, c3, param_scale;
+ } : "";
+}
+
+
+sub max_abs1 { # when $int16 is set: zeroes cs0-cs3 for predicated-off lanes, otherwise merges the VABSDIFF of c0-c3 against RZ into their low 16 bits; empty string when unset
+ return $int16? q{
+--:-:-:-:1 @!P0 MOV cs0, RZ;
+--:-:-:-:1 @!P1 MOV cs1, RZ;
+--:-:-:-:1 @!P2 MOV cs2, RZ;
+--:-:-:-:1 @!P3 MOV cs3, RZ;
+
+--:-:-:-:1 @P0 VABSDIFF.S16.S16.MRG_16L cs0, c0, RZ, RZ;
+--:-:-:-:1 @P1 VABSDIFF.S16.S16.MRG_16L cs1, c1, RZ, RZ;
+--:-:-:-:1 @P2 VABSDIFF.S16.S16.MRG_16L cs2, c2, RZ, RZ;
+--:-:-:-:1 @P3 VABSDIFF.S16.S16.MRG_16L cs3, c3, RZ, RZ;
+ } : "";
+}
+
+
+sub max_abs2 { # when $int16 is set: merges a second VABSDIFF into the high 16 bits of cs0-cs3, then folds both halves of each into maxabs with VMNMX; empty string when unset
+ return $int16? q{
+
+
+// a = abs(a)
+--:-:-:-:1 @P0 VABSDIFF.S16.S16.MRG_16H cs0, c0, RZ, cs0;
+--:-:-:-:1 @P1 VABSDIFF.S16.S16.MRG_16H cs1, c1, RZ, cs1;
+--:-:-:-:1 @P2 VABSDIFF.S16.S16.MRG_16H cs2, c2, RZ, cs2;
+--:-:-:-:1 @P3 VABSDIFF.S16.S16.MRG_16H cs3, c3, RZ, cs3;
+
+// max = max(c,d,max(a,b,max)) ...
+--:-:-:-:1 VMNMX.UD.U16.U16.MX.MAX maxabs, cs0, cs0.H1, maxabs;
+--:-:-:-:1 VMNMX.UD.U16.U16.MX.MAX maxabs, cs1, cs1.H1, maxabs;
+--:-:-:-:1 VMNMX.UD.U16.U16.MX.MAX maxabs, cs2, cs2.H1, maxabs;
+--:-:-:-:1 VMNMX.UD.U16.U16.MX.MAX maxabs, cs3, cs3.H1, maxabs;
+
+
+ } : "";
+}
+
+
+sub butterfly { # when $int16 is set: SHFL.BFLY/IMNMX steps over lane offsets 0x10,8,4,2,1 combine maxabs, then threads with (tid & 31) == 0 RED.E.MAX it into [Stats]; empty string when unset
+ return $int16 ? q{
+--:-:-:-:0 LOP.AND.Z P0, RZ, tid, 31;
+--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x10, 0x1f;
+01:-:-:-:4 IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x8, 0x1f;
+01:-:-:-:4 IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x4, 0x1f;
+01:-:-:-:4 IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:0 MOV Stats0, param_Stats[0];
+--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x2, 0x1f;
+01:-:-:-:4 IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:0 MOV Stats1, param_Stats[1];
+--:-:1:-:2 SHFL.BFLY PT, warp_max, maxabs, 0x1, 0x1f;
+01:-:-:-:2 IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:1 @P0 RED.E.MAX [Stats], maxabs;
+ } : "";
+}
+
+-]
+
+
+--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>];
+--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>];
+
+LOOP:
+
+[+
+
+ our @top; # instruction text placed ahead of the unrolled loop body; filled in outside this block
+ our %insert; # extra instruction text keyed "j<step>c<ffma index>", appended after that FFMA
+
+ my @cOrder; # [x, y] accumulator index pair for each of the 64 FFMAs of one j step
+ my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+ my @y = (0,1,4,5);
+ foreach my $x (0,2,4,6)
+ {
+ foreach my $y (@y)
+ {
+ push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+ }
+ @y = reverse @y; # walk the y values in the opposite direction for the next x pair
+ }
+
+ my $out = join '', @top;
+
+ foreach my $j (0 .. 7)
+ {
+ my $odd = $j & 1; # register set (j0* or j1*) read by this step's FFMAs
+ my $nOdd = !$odd + 0; # the other register set, filled by the LDS loads queued below
+ my $rsOffset = ($j + 1) % 8; # 128-word offset index of the next step; wraps to 0 after the last step
+ my $rsPred = $j == 7 ? '@P0' : ' '; # only the wrap-around loads of the last step are predicated on P0
+
+ $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+ $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+ $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+ $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+
+ foreach my $c (0 .. 63)
+ {
+ my ($x,$y) = @{$cOrder[$c]};
+
+ my $ins = $insert{"j${j}c$c"} || '';
+
+ my $yield = $c == 32 ?
'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + ++] + +--:-:-:-:1 IADD loop, loop, 1; +--:-:-:-:1 IADD ta, ta, param_ldaz; +--:-:-:-:1 IADD tb, tb, param_ldbz; +--:-:-:-:3 MOV k, param_k; +--:-:-:-:1 ISETP.LT.AND P1, PT, loop, param_loops, PT; +--:-:-:-:6 LEA trackA0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 1; +--:-:-:-:6 LEA trackB0.CC, tb, param_B[0], 1; +--:-:-:-:0 LEA.HI.X trackB1, tb, param_B[1], RZ, 1; +--:-:-:Y:5 @P1 BRA.U REMAINDER; + + + +// writeCs = (readAs / 4) * 128 + readBs; +--:-:-:-:1 LOP.AND readAs, readAs, 0xfff; +--:-:-:-:1 LOP.AND readBs, readBs, 0xfff; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 5; + +--:-:-:-:1 LOP.AND tid_31, tid, 31; +--:-:-:-:1 LOP.AND tid_96, tid, 96; +--:-:-:-:1 LOP.AND tid_128, tid, 128; + +// cx = tid31 | (tid_128 >> 2); +--:-:-:-:1 SHR.U32 cx00, tid_128, 2; +--:-:-:-:1 LOP.OR cx00, tid_31, cx00; + +// readCs = ((tid_96 << 4) | cx) << 2; +--:-:-:-:1 SHL readCs, tid_96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, cx00; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx += blkB*128; +--:-:-:-:1 ISCADD cx00, blkB, cx00, 7; +--:-:-:-:1 IADD cx64, cx00, 64; + +// cy = blkA*128 + (tid_96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid_96, 1; +--:-:-:-:1 ISCADD cy00, blkA, cy00, 7; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx00, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 1; + +--:-:-:-:1 SHL ldc1, ldc, 1; +--:-:-:-:1 SHL ldc4, ldc, 3; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 7; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, 
param_flags; +--:-:-:-:1 MOV maxabs, RZ; + +--:-:-:-:1 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +[+ butterfly() +] +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.S16 d0, [C00y0 + 2x<00>]; +--:-:2:-:1 @P1 LDG.E.S16 d1, [C00y0 + 2x<64>]; +--:-:3:-:1 @P2 LDG.E.S16 d2, [C04y0 + 2x<00>]; +--:-:4:-:1 @P3 LDG.E.S16 d3, [C04y0 + 2x<64>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c4, c4, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c5, c5, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c6, c6, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c7, c7, RZ, !PT; + +--:-:-:-:5 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 + + +--:-:-:-:3 STS.128 [writeCs+4x<00>], c0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], c4; + +--:-:-:-:0 IADD cy00, cy00, 1; + +--:-:-:-:1 LDS c0, [readCs + 4x<0*128 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<0*128 + 64>]; +--:-:-:-:1 LDS c2, [readCs + 4x<1*128 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<1*128 + 64>]; + +--:-:-:-:0 IADD cy04, cy04, 1; + +01:-:1:-:1 @P6 [+ convert_in() +] d0, d0; +02:-:2:-:1 @P6 [+ convert_in() +] d1, d1; +04:-:3:-:1 @P6 
[+ convert_in() +] d2, d2; +08:-:4:-:1 @P6 [+ convert_in() +] d3, d3; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:3 @P6 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +01:-:-:-:1 @P0 STG.E.S16 [C00y0 + 2x<00>], c0; +02:5:-:-:1 @P1 STG.E.S16 [C00y0 + 2x<64>], c1; +04:-:-:-:1 @P2 STG.E.S16 [C04y0 + 2x<00>], c2; +08:6:-:-:1 @P3 STG.E.S16 [C04y0 + 2x<64>], c3; + +[+ max_abs1() +] + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.S16 d0, [C08y0 + 2x<00>]; +--:-:2:-:1 @P1 LDG.E.S16 d1, [C08y0 + 2x<64>]; +--:-:3:-:1 @P2 LDG.E.S16 d2, [C12y0 + 2x<00>]; +--:-:4:-:1 @P3 LDG.E.S16 d3, [C12y0 + 2x<64>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +--:-:-:-:2 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:2 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + + +10:-:-:-:4 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +20:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:0 IADD.X C04y1, C04y1, RZ; + +--:-:-:-:1 LDS c0, [readCs + 4x<2*128 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<2*128 + 64>]; +--:-:-:-:1 LDS c2, [readCs + 4x<3*128 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*128 + 64>]; + +01:-:1:-:4 @P6 [+ convert_in() +] d0, d0; +02:-:2:-:4 @P6 [+ 
convert_in() +] d1, d1; +04:-:3:-:4 @P6 [+ convert_in() +] d2, d2; +08:-:4:-:1 @P6 [+ convert_in() +] d3, d3; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:3 @P6 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +01:-:-:-:1 @P0 STG.E.S16 [C08y0 + 2x<00>], c0; +02:5:-:-:1 @P1 STG.E.S16 [C08y0 + 2x<64>], c1; +04:-:-:-:1 @P2 STG.E.S16 [C12y0 + 2x<00>], c2; +08:6:-:-:1 @P3 STG.E.S16 [C12y0 + 2x<64>], c3; + +[+ max_abs2() +] + +10:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +20:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_common_128x32.sass b/Kernel/SGEMM/Pascal/hgemm_common_128x32.sass new file mode 100644 index 0000000..9d4860a --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_common_128x32.sass @@ -0,0 +1,246 @@ +# hgemm_common_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ? $shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 16 && $stall ? 
'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 32 + readBs; +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 3; + +// readCs = ((tid & 96) << 2) | (tid & 31) << 2; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 2; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*32 + tid31; +--:-:-:-:1 ISCADD cx, blkB, tid31, 5; + +// cy = blkA*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 ISCADD cy00, blkA, cy00, 7; + +// C += (cy*ldc + cx) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 1; + +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; + +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P4, RZ, flags, 2; + +--:-:-:-:1 SHL ldc1, ldc, 1; +--:-:-:-:1 SHL ldc4, ldc, 3; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 7; + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . 
+ "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.U16 d0, [C00y]; +--:-:2:-:1 @P1 LDG.E.U16 d1, [C04y]; +--:-:3:-:1 @P2 LDG.E.U16 d2, [C08y]; +--:-:4:-:1 @P3 LDG.E.U16 d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:0 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*32>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*32>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*32>]; + + +01:-:1:-:1 @P5 
F2F.F32.F16 d0, d0; +02:-:2:-:1 @P5 F2F.F32.F16 d1, d1; +04:-:3:-:1 @P5 F2F.F32.F16 d2, d2; +08:-:4:-:1 @P5 F2F.F32.F16 d3, d3; + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P5 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +01:1:-:-:1 @P0 STG.E.CG.U16 [C00y], c0; +02:2:-:-:1 @P1 STG.E.CG.U16 [C04y], c1; +04:3:-:-:1 @P2 STG.E.CG.U16 [C08y], c2; +08:4:-:-:1 @P3 STG.E.CG.U16 [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_common_128x64.sass b/Kernel/SGEMM/Pascal/hgemm_common_128x64.sass new file mode 100644 index 0000000..a375c03 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_common_128x64.sass @@ -0,0 +1,318 @@ +# hgemm_common_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +LOOP: + + + + our @top; + our %insert; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $yield = $c == 32 ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 
0 : 1; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +--:-:1:-:1 S2R threadId, SR_TID.X; +--:-:2:-:1 S2R blockA, SR_CTAID.Y; +--:-:3:-:1 S2R blockB, SR_CTAID.Z; +--:-:4:-:1 S2R blockZ, SR_CTAID.X; + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 64 + readBs; +--:-:-:-:1 LOP.AND readAs, readAs, 0xff; +--:-:-:-:1 LOP.AND readBs, readBs, 0xff; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 4; + +// readCs = ((threadId & 96) << 3) | (threadId & 31) << 2; +01:-:-:-:1 LOP.AND tid31, threadId, 31; +01:-:-:-:1 LOP.AND tid96, threadId, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx00 = blkB*64 + tid31; +04:-:-:-:1 ISCADD cx00, blockB, tid31, 6; +--:-:-:-:1 IADD cx32, cx00, 32; + +// cy = blkA*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +02:-:-:-:1 ISCADD cy00, blockA, cy00, 7; + +// C += (cy*ldc + cx00) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx00, xmad_c; +08:-:-:-:1 XMAD.LO2 ci, ldcz, blockZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 1; + +--:-:-:-:1 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 + +--:-:-:-:1 SHL ldc1, ldc, 1; +--:-:-:-:1 SHL ldc4, ldc, 3; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 7; + + + +//--:-:1:-:2 I2F.F32.U32 temp, threadId; +//01:-:-:-:1 F2F.F16.F32 temp, temp; + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) 
+ { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . + "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, P6; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.S16 d0, [C00y0 + 2x<00>]; +--:-:2:-:1 @P1 LDG.E.S16 d1, [C00y0 + 2x<32>]; +--:-:3:-:1 @P2 LDG.E.S16 d2, [C04y0 + 2x<00>]; +--:-:4:-:1 @P3 LDG.E.S16 d3, [C04y0 + 2x<32>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +// Apply relu 
+--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c4, c4, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c5, c5, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c6, c6, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c7, c7, RZ, !PT; + +--:-:-:-:5 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 + + +--:-:-:-:1 STS.128 [writeCs+4x<00>], c0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], c4; + +--:-:-:-:0 IADD cy00, cy00, 1; + +--:-:-:-:1 LDS c0, [readCs + 4x<0*64 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<0*64 + 32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<1*64 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<1*64 + 32>]; + +--:-:-:-:0 IADD cy04, cy04, 1; + +01:-:1:-:1 @P6 F2F.F32.F16 d0, d0; +02:-:2:-:1 @P6 F2F.F32.F16 d1, d1; +04:-:3:-:1 @P6 F2F.F32.F16 d2, d2; +08:-:4:-:1 @P6 F2F.F32.F16 d3, d3; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P6 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, P6; + +// Stochastic Round flag +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 1; + +01:-:-:-:1 @P0 STG.E.S16 [C00y0 + 2x<00>], c0; +02:5:-:-:1 @P1 STG.E.S16 [C00y0 + 2x<32>], c1; +04:-:-:-:1 @P2 STG.E.S16 [C04y0 + 2x<00>], c2; +08:6:-:-:1 @P3 STG.E.S16 [C04y0 + 2x<32>], c3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.S16 d0, [C08y0 + 2x<00>]; +--:-:2:-:1 @P1 LDG.E.S16 d1, [C08y0 + 2x<32>]; +--:-:3:-:1 @P2 LDG.E.S16 d2, [C12y0 + 2x<00>]; +--:-:4:-:1 @P3 LDG.E.S16 d3, [C12y0 + 2x<32>]; +--:-:-:-:1 @!P0 MOV d0, RZ; 
+--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, PT; + +--:-:-:-:2 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:2 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + + +10:-:-:-:4 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +20:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:0 IADD.X C04y1, C04y1, RZ; + +--:-:-:-:1 LDS c0, [readCs + 4x<2*64 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<3*64 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*64 + 32>]; + +01:-:1:-:1 @P6 F2F.F32.F16 d0, d0; +02:-:2:-:1 @P6 F2F.F32.F16 d1, d1; +04:-:3:-:1 @P6 F2F.F32.F16 d2, d2; +08:-:4:-:1 @P6 F2F.F32.F16 d3, d3; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P6 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +01:-:-:-:1 @P0 STG.E.S16 [C08y0 + 2x<00>], c0; +02:5:-:-:1 @P1 STG.E.S16 [C08y0 + 2x<32>], c1; +04:-:-:-:1 @P2 STG.E.S16 [C12y0 + 2x<00>], c2; +08:6:-:-:1 @P3 STG.E.S16 [C12y0 + 2x<32>], c3; + +10:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +20:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_common_32x128.sass b/Kernel/SGEMM/Pascal/hgemm_common_32x128.sass new file mode 100644 index 0000000..3661b08 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_common_32x128.sass @@ -0,0 +1,244 @@ +# Kernel: hgemm_common_32x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*32 + 16 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ? 
$shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 16 && $stall ? 'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 128 + readBs; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 5; + +// readCs = tid * 4; +--:-:-:-:1 SHL readCs, tid, 2; + +// cx = blkB*128 + tid; +--:-:-:-:1 ISCADD cx, blkB, tid, 7; + +// cy = blkA*32 +--:-:-:-:1 SHL cy00, blkA, 5; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; + +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 1; + +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; + +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P4, RZ, flags, 
2; + +--:-:-:-:1 SHL ldc1, ldc, 1; +--:-:-:-:1 SHL ldc4, ldc, 3; +--:-:-:-:1 ISCADD ldc12, ldc, -ldc4, 5; + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc12;\n" . + "--:-:-:-:1 IADD cy00, cy00, 12;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc12;\n" . + "--:-:-:-:1 IADD cy04, cy04, 12;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc12;\n" . + "--:-:-:-:1 IADD cy08, cy08, 12;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc12;\n" . + "--:-:-:-:1 IADD cy12, cy12, 12;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E.U16 d0, [C00y]; +--:-:2:-:1 @P1 LDG.E.U16 d1, [C04y]; +--:-:3:-:1 @P2 LDG.E.U16 d2, [C08y]; +--:-:4:-:1 @P3 LDG.E.U16 d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:0 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*128>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*128>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*128>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*128>]; + + +01:-:1:-:1 @P5 F2F.F32.F16 d0, d0; +02:-:2:-:1 @P5 F2F.F32.F16 d1, d1; +04:-:3:-:1 @P5 F2F.F32.F16 d2, d2; +08:-:4:-:1 @P5 F2F.F32.F16 d3, d3; + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P5 FFMA c3, d3, beta, c3; + +--:-:1:-:1 F2F.F16.F32 c0, c0; +--:-:2:-:1 F2F.F16.F32 c1, c1; +--:-:3:-:1 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +01:1:-:-:1 @P0 STG.E.CG.U16 [C00y], c0; +02:2:-:-:1 @P1 STG.E.CG.U16 [C04y], c1; +04:3:-:-:1 @P2 STG.E.CG.U16 [C08y], c2; +08:4:-:-:1 @P3 STG.E.CG.U16 [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 
IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_128x128.sass b/Kernel/SGEMM/Pascal/hgemm_nn_128x128.sass new file mode 100644 index 0000000..0b4f460 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nn_128x128.sass @@ -0,0 +1,393 @@ +# Kernel: hgemm_nn_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- +our $int16; +our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; +sub convert_in {return $convert;} + +sub int16_params { + return $int16 ? 
q{ +param_Stats[0] : c[0x0][0x190] +param_Stats[1] : c[0x0][0x194] +param_scale : c[0x0][0x198] + } : ""; +} +-] + + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + [+ int16_params() +] + + + + + 64-95 ~ tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, tid31, tid128, txa, xmad_ta, xmad_tb, k<1-3>, x<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-105 : loadB<0-3>, loadA<0-5> + + 106-109 : trackA<0-1>, trackB<0-1> + + 110-118 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop + 119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ + + 64-75 ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128 + + 64-79 : c<0-7>, d3, d2, d1, d0, cs<0-3> + 64-65 : Stats<0-1> + 80-89 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 90-118 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs + + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; 
+--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 MOV loop, RZ; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); ++] + +01:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid128, tid, 128; + +// tidAY = (tid & 1) << 2 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; + +// tidAX = tid >> 1 +--:-:-:-:1 SHR.U32 tidAX, tid, 1; + +// trackA += 2 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) & 7 +--:-:-:-:1 SHL tidBX, tid31, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x305; // 3 bits at position 5 + +// trackB += (blkB*128 + ldb*tidBY + tidBX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 7; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeAs = 4 * (128 * tidAY + tidAX) +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<128*8*2>, 2; + + +// writeBs = (128*tidBY + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 7; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<128*8*3>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, 
readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + +[+ + our $vec; + return $vec ? q{ +--:-:-:-:2 ISETP.LT.AND P3, PT, tidBY, k, P6; +--:-:-:Y:b ISETP.LT.AND P2, PT, tidAY, k, P5; + +--:-:4:-:2 @P3 LDG.E.CI.64 loadB0, [trackB]; +--:-:2:-:1 @P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>]; +--:-:2:-:1 @P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>]; + +--:-:-:-:0 PSETP.AND.AND P4, PT, PT, PT, PT; + +--:-:5:-:1 @!P3 LDS.U.64 loadB0, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.64 loadA0, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.64 loadA4, [addr_zero]; + } : q{ + + +// doLoad0 = tidBY < k +--:-:-:-:1 IADD x1, txb, 1; +--:-:-:-:1 IADD x2, txb, 2; +--:-:-:-:1 IADD x3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_n, P0; + +--:-:4:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>]; +--:-:4:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>]; +--:-:4:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>]; +--:-:4:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + +--:-:-:-:1 IADD k1, tidAY, 1; +--:-:-:-:1 IADD k2, tidAY, 2; +--:-:-:-:1 IADD k3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + + }; ++] + +[+ + our $vec; + our $convert; + return $vec ? 
qq{ +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 7; + +18:-:-:-:4 $convert loadB3, loadB1.H1; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 $convert loadB2, loadB1.H0; +--:-:-:-:4 $convert loadB1, loadB0.H1; +--:-:4:-:2 $convert loadB0, loadB0.H0; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +08:-:-:-:1 STS.128 [writeBs], loadB0; + +22:-:-:-:4 $convert loadA3, loadA1.H1; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<16>; +--:-:2:-:4 $convert loadA2, loadA1.H0; +--:-:-:-:4 $convert loadA1, loadA0.H1; +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; +--:-:3:-:1 $convert loadA0, loadA0.H0; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<3*128>], loadA3; +--:-:-:-:1 STS [writeAs + 4x<2*128>], loadA2; +04:-:-:-:1 STS [writeAs + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<0*128>], loadA0; + } : qq{ +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +08:-:-:-:4 $convert loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 $convert loadB1, loadB1; +--:-:-:-:4 $convert loadB2, loadB2; +--:-:4:-:2 $convert loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +08:-:-:-:1 STS.128 [writeBs], loadB0; + +02:-:-:-:4 $convert loadA0, loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<8>; +--:-:2:-:4 $convert loadA1, loadA1; +--:-:-:-:4 $convert loadA2, loadA2; +--:-:3:-:1 $convert loadA3, loadA3; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeAs + 4x<1*128>], loadA1; +04:-:-:-:1 STS [writeAs + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*128>], loadA3; + }; ++] + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 LOP.XOR writeAs, writeAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR writeBs, writeBs, 4x<128*8*2>; + + + +[+ + our $vec; + our $convert; + my $k_end = $vec ? 
16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n"); + our %insert = + ( + ($vec ? + ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P4, PT, !P4, PT, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c15 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P4, P5;\n", + + j0c10 => "--:-:2:-:1 \@P3 LDG.E.CI.64 loadB0, [trackB];\n", + + j0c28 => "--:-:5:-:1 \@P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];\n", + j0c30 => "20:4:6:-:1 \@P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];\n", + + j4c5 => "--:-:-:-:1 \@!P4 $convert loadA3, loadA5.H1;\n", + j4c9 => "--:-:-:-:1 \@!P4 $convert loadA2, loadA5.H0;\n", + j4c13 => "--:-:-:-:1 \@!P4 $convert loadA1, loadA4.H1;\n", + j4c17 => "--:-:-:-:1 \@!P4 $convert loadA0, loadA4.H0;\n", + + j5c5 => "02:-:-:-:1 \@P0 $convert loadB3, loadB1.H1;\n", + j5c9 => "--:-:-:-:1 \@P0 $convert loadB2, loadB1.H0;\n", + j5c13 => "--:-:-:-:1 \@P0 $convert loadB1, loadB0.H1;\n", + j5c17 => "--:-:2:-:1 \@P0 $convert loadB0, loadB0.H0;\n", + + j5c35 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c5 => "10:-:2:-:1 \@P4 $convert loadA3, loadA1.H1;\n", + j6c9 => "--:-:3:-:1 \@P4 $convert loadA2, loadA1.H0;\n", + j6c13 => "--:-:4:-:1 \@P4 $convert loadA1, loadA0.H1;\n", + j6c17 => "--:-:5:-:1 \@P4 $convert loadA0, loadA0.H0;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n", + j6c31 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n", + j6c33 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n", + j6c35 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n", + + j6c11 => "08:-:-:-:1 \@P4 IADD trackA0.CC, trackA0, 2x<16>;\n", + j6c54 => "--:-:-:-:1 \@P4 IADD.X trackA1, trackA1, RZ;\n", + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + j0c10 => "--:-:2:-:1 \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j0c12 => "--:-:2:-:1 \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j0c14 => "--:-:2:-:1 \@P3 
LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j0c16 => "--:-:2:-:1 \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j0c29 => "--:-:6:-:1 \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n", + j0c31 => "--:-:6:-:1 \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n", + j0c33 => "--:-:6:-:1 \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n", + j0c35 => "--:-:6:-:1 \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n", + + j5c8 => "02:-:-:-:1 \@P3 $convert loadB0, loadB0;\n", + j5c12 => "--:-:-:-:1 \@P3 $convert loadB1, loadB1;\n", + j5c16 => "--:-:-:-:1 \@P3 $convert loadB2, loadB2;\n", + j5c20 => "--:-:2:-:1 \@P3 $convert loadB3, loadB3;\n", + + j5c39 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c5 => "20:-:2:-:1 \@P2 $convert loadA0, loadA0;\n", + j6c9 => "--:-:3:-:1 \@P2 $convert loadA1, loadA1;\n", + j6c13 => "--:-:4:-:1 \@P2 $convert loadA2, loadA2;\n", + j6c17 => "--:-:5:-:1 \@P2 $convert loadA3, loadA3;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n", + j6c31 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n", + j6c33 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n", + j6c35 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n", + + j6c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 2x<8>;\n", + j6c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + ) + ), + + j5c46 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:0 IADD32I k, k, -8;\n" . + "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . 
+ "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; ++] + + diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_128x32.sass b/Kernel/SGEMM/Pascal/hgemm_nn_128x32.sass new file mode 100644 index 0000000..33a4a9a --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nn_128x32.sass @@ -0,0 +1,590 @@ +# Kernel: hgemm_nn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(128*16 + 32)*2 + 32*16*2> + szShareA : (128*16 + 32) + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, shiftAX + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 
32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb16, ldb, 5; +--:-:-:-:1 SHL lda32, lda, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += 2 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa00, blkA, tidAX, 7; +--:-:-:-:1 IADD txa32, txa00, 32; +--:-:-:-:1 IADD txa64, txa00, 64; +--:-:-:-:1 IADD txa96, txa00, 96; + +--:-:-:-:1 XMAD.LO ta00, lda, txa00, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta00, ldaz, blkZ, ta00; +--:-:-:-:1 IADD ta32, ta00, lda32; +--:-:-:-:1 IADD ta64, ta32, lda32; +--:-:-:-:1 IADD ta96, ta64, lda32; + +--:-:-:-:1 LEA track0A0.CC, 
ta00, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta00, param_A[1], RZ, 1; +--:-:-:-:1 LEA track1A0.CC, ta32, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track1A1, ta32, param_A[1], RZ, 1; +--:-:-:-:1 LEA track2A0.CC, ta64, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track2A1, ta64, param_A[1], RZ, 1; +--:-:-:-:1 LEA track3A0.CC, ta96, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track3A1, ta96, param_A[1], RZ, 1; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 1; + +// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? 
q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.CI.64 load0A, [track0A]; +--:-:2:-:1 @P3 LDG.E.CI.64 load1A, [track1A]; +--:-:3:-:1 @P4 LDG.E.CI.64 load2A, [track2A]; +--:-:4:-:1 @P5 LDG.E.CI.64 load3A, [track3A]; +--:-:5:-:1 @P6 LDG.E.CI.64 loadB, [trackB]; + + + +--:-:6:-:1 @!P2 LDS.U.64 load0A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 load1A, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.64 load2A, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.64 load3A, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.64 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:1:-:1 @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:1:-:1 @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:1:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:1:-:1 @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI.U16 load1A0, 
[track1A + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI.U16 load2A0, [track2A + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.U16 load2A1, [track2A + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load2A2, [track2A + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load2A3, [track2A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI.U16 load3A0, [track3A + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.U16 load3A1, [track3A + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.U16 load3A2, [track3A + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load3A3, [track3A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:5:-:1 @P0 LDG.E.CI.U16 loadB0, [trackB + 2x<0>]; +--:-:5:-:1 @P1 LDG.E.CI.U16 
loadB1, [trackB + 2x<1>]; +--:-:5:-:1 @P2 LDG.E.CI.U16 loadB2, [trackB + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 loadB3, [trackB + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P1, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + + + + + our $vec; + return $vec ? q{ +21:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:1:-:1 F2F.F32.F16 load0A0, load0A0.H0; + +02:-:-:-:1 F2F.F32.F16 load1A3, load1A1.H1; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A1.H0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A0.H1; +--:-:2:-:1 F2F.F32.F16 load1A0, load1A0.H0; + +04:-:-:-:1 F2F.F32.F16 load2A3, load2A1.H1; +--:-:-:-:1 F2F.F32.F16 load2A2, load2A1.H0; +--:-:-:-:1 F2F.F32.F16 load2A1, load2A0.H1; +--:-:3:-:1 F2F.F32.F16 load2A0, load2A0.H0; + +08:-:-:-:1 F2F.F32.F16 load3A3, load3A1.H1; +--:-:-:-:1 F2F.F32.F16 load3A2, load3A1.H0; +--:-:-:-:1 F2F.F32.F16 load3A1, load3A0.H1; +--:-:4:-:1 F2F.F32.F16 load3A0, load3A0.H0; + +10:-:-:-:1 F2F.F32.F16 loadB3, loadB1.H1; +--:-:-:-:1 F2F.F32.F16 loadB2, loadB1.H0; +--:-:-:-:1 F2F.F32.F16 loadB1, loadB0.H1; +--:-:5:-:1 F2F.F32.F16 loadB0, loadB0.H0; + } : q{ +21:-:-:-:1 F2F.F32.F16 load0A0, load0A0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:1:-:1 F2F.F32.F16 load0A3, load0A3; + +02:-:-:-:1 F2F.F32.F16 load1A0, load1A0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A1; +--:-:-:-:1 
F2F.F32.F16 load1A2, load1A2; +--:-:2:-:1 F2F.F32.F16 load1A3, load1A3; + +04:-:-:-:1 F2F.F32.F16 load2A0, load2A0; +--:-:-:-:1 F2F.F32.F16 load2A1, load2A1; +--:-:-:-:1 F2F.F32.F16 load2A2, load2A2; +--:-:3:-:1 F2F.F32.F16 load2A3, load2A3; + +08:-:-:-:1 F2F.F32.F16 load3A0, load3A0; +--:-:-:-:1 F2F.F32.F16 load3A1, load3A1; +--:-:-:-:1 F2F.F32.F16 load3A2, load3A2; +--:-:4:-:1 F2F.F32.F16 load3A3, load3A3; + +10:-:-:-:1 F2F.F32.F16 loadB0, loadB0; +--:-:-:-:1 F2F.F32.F16 loadB1, loadB1; +--:-:-:-:1 F2F.F32.F16 loadB2, loadB2; +--:-:5:-:1 F2F.F32.F16 loadB3, loadB3; + }; + + +01:-:-:-:1 STS [writeAs + 4x<0*128 + 0*32>], load0A0; +--:-:-:-:0 IADD track0A0.CC, track0A0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 0*32>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 0*32>], load0A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 0*32>], load0A3; + +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128 + 1*32>], load1A0; +--:-:-:-:0 IADD track1A0.CC, track1A0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 1*32>], load1A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 1*32>], load1A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 1*32>], load1A3; + +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 2*32>], load2A0; +--:-:-:-:0 IADD track2A0.CC, track2A0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 2*32>], load2A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 2*32>], load2A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 2*32>], load2A3; + +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 3*32>], load3A0; +--:-:-:-:0 IADD track3A0.CC, track3A0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 3*32>], load3A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 3*32>], load3A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 3*32>], load3A3; + +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, 
readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.CI.64 load0A, [track0A]; +--:-:4:-:1 @P3 LDG.E.CI.64 load1A, [track1A]; +--:-:5:-:1 @P4 LDG.E.CI.64 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.CI.64 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.64 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:5:-:1 @P4 LDG.E.CI.U16 load2A0, [track2A + 2x<0>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2A1, [track2A + 2x<1>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2A2, [track2A + 2x<2>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2A3, [track2A + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n", + j3c10 => "--:-:-:-:1 \@P0 STS 
[writeAs + 4x<2*128 + 0*32>], load0A2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n", + + j9c6 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n", + + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 2x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2A0.CC, track2A0, 2x<16>;\n", + j7c13 => "--:-:-:-:1 \@P4 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, 2x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 
ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.64 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI.64 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.64 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.64 loadB, [trackB];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0A3, load0A1.H1;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A2, load0A1.H0;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A1, load0A0.H1;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1A3, load1A1.H1;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A2, load1A1.H0;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A1, load1A0.H1;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1A0, load1A0.H0;\n", + + j6c13 => "10:-:-:-:1 \@P4 F2F.F32.F16 load2A3, load2A1.H1;\n", + j6c17 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2A2, load2A1.H0;\n", + j6c21 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2A1, load2A0.H1;\n", + j6c25 => "--:-:5:-:1 \@P4 F2F.F32.F16 load2A0, load2A0.H0;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A3, load3A1.H1;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A2, load3A1.H0;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A1, load3A0.H1;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3A0, load3A0.H0;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n", + ) : 
+ ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];\n", + j9c31 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];\n", + j10c1 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];\n", + j10c3 => "--:-:5:-:1 \@P4 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0A0, load0A0;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A1, load0A1;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A2, load0A2;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0A3, load0A3;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1A0, load1A0;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A1, load1A1;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A2, load1A2;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1A3, load1A3;\n", + + j6c13 => "10:-:-:-:1 \@P4 
F2F.F32.F16 load2A0, load2A0;\n", + j6c17 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2A1, load2A1;\n", + j6c21 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2A2, load2A2;\n", + j6c25 => "--:-:5:-:1 \@P4 F2F.F32.F16 load2A3, load2A3;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A0, load3A0;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A1, load3A1;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A2, load3A2;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3A3, load3A3;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB3;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB2;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB0, loadB0;\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_128x64.sass b/Kernel/SGEMM/Pascal/hgemm_nn_128x64.sass new file mode 100644 index 0000000..8e6c457 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nn_128x64.sass @@ -0,0 +1,438 @@ +# Kernel: hgemm_nn_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*2 + 64*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ tid, blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, txa, txa1, ta, xmad_ta, tb, xmad_tb, tidAY, tidBY, k<1-3>, x<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-111 : loadA<0-7>, loadAA<0-3>, loadB<0-3> + + 112-117 : track0A<0-1>, track1A<0-1>, trackB<0-1> + + 118-122 ~ writeAs, writeBs, k, txb, swapBuf + 123-127 : readAs, readBs + + 64-83 ~ ldc, ldcz, ci, xmad_c, threadId, tid31, tid96, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C04y<0-1>, C08y<0-1>, C12y<0-1>, C00y<0-1> + 86-107 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +--:-:-:-:1 STS.128 
[addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +// tidAX = tid & 0xfe +// tidAY = (tid & 1) << 2 +01:-:-:-:1 LOP.AND tidAX, tid, 0xfe; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; + +// trackA += 2 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 1; +--:-:-:-:1 LEA track1A0.CC, lda, track0A0, 1; +--:-:-:-:1 LEA.HI.X track1A1, lda, track0A1, RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa, param_m, PT; +--:-:-:-:1 IADD txa1, txa, 1; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa1, param_m, PT; + +// tidBX = (tid & 15) << 2 +// tidBY = (tid >> 4) & 7 +--:-:-:-:1 LOP.AND tidBX, tid, 15; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4 + +// trackB += (blkB*64 + ldb*tidBY + tidBX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 1; +--:-:-:-:2 LEA.HI.X trackB1, tb, param_B[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// Start the write buffers high +// writeAs = (128*tidAY + tidAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; +// writeBs = (64*tidBY + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; + +// Start the read buffers low +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 
ISCADD readBs, readBs, 4x<128*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<64*8 + 128*8>; + + +REMAINDER: + + + our $vec; + return $vec ? q{ +--:-:6:-:1 @P6 LDG.E.CI.64 loadB0, [trackB]; + +--:-:2:-:1 @P5 LDG.E.CI.64 loadA2, [track1A + 2x<0>]; +--:-:2:-:1 @P5 LDG.E.CI.64 loadAA2, [track1A + 2x<8>]; + +--:-:3:-:1 @P4 LDG.E.CI.64 loadA0, [track0A + 2x<0>]; +--:-:3:-:1 @P4 LDG.E.CI.64 loadAA0, [track0A + 2x<8>]; + +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +--:-:4:-:1 @!P6 LDS.U.64 loadB0, [addr_zero]; +--:-:5:-:1 @!P5 LDS.U.64 loadA2, [addr_zero]; +--:-:5:-:1 @!P4 LDS.U.64 loadA0, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.64 loadAA2, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.64 loadAA0, [addr_zero]; + } : q{ + +--:-:2:-:2 S2R tid, SR_TID.X; + + +02:-:-:-:1 LOP.AND tidAY, tid, 1; +--:-:-:-:1 SHL tidAY, tidAY, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4 + +// doLoad0 = tidBY < k +--:-:-:-:1 IADD x1, txb, 1; +--:-:-:-:1 IADD x2, txb, 2; +--:-:-:-:1 IADD x3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_n, P0; + +--:-:6:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<0>]; +--:-:6:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<1>]; +--:-:6:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<2>]; +--:-:6:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + +--:-:-:-:1 IADD k1, tidAY, 1; +--:-:-:-:1 IADD k2, tidAY, 2; +--:-:-:-:1 IADD k3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P4; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [track0A + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA2, [track0A + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA4, [track0A + 2x<2>]; 
+--:-:2:-:1 @P3 LDG.E.CI.S16 loadA6, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA2, RZ; +--:-:-:-:1 @!P2 MOV loadA4, RZ; +--:-:-:-:1 @!P3 MOV loadA6, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:3:-:1 @P0 LDG.E.CI.S16 loadA1, [track1A + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.S16 loadA3, [track1A + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.S16 loadA5, [track1A + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.S16 loadA7, [track1A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA1, RZ; +--:-:-:-:1 @!P1 MOV loadA3, RZ; +--:-:-:-:1 @!P2 MOV loadA5, RZ; +--:-:-:-:1 @!P3 MOV loadA7, RZ; + + }; + + + + our $vec; + return $vec ? q{ +28:-:-:-:4 F2F.F32.F16 loadB3, loadB1.H1; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 F2F.F32.F16 loadB2, loadB1.H0; +--:-:-:-:4 F2F.F32.F16 loadB1, loadB0.H1; +--:-:4:-:2 F2F.F32.F16 loadB0, loadB0.H0; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +08:-:-:-:1 STS.128 [writeBs], loadB0; + +12:-:-:-:4 F2F.F32.F16 loadA7, loadA3.H1; +04:-:2:-:4 F2F.F32.F16 loadA6, loadA1.H1; +--:-:-:-:0 IADD track0A0.CC, track0A0, 2x<16>; +--:-:-:-:4 F2F.F32.F16 loadA5, loadA3.H0; +--:-:3:-:4 F2F.F32.F16 loadA4, loadA1.H0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; +--:-:-:-:4 F2F.F32.F16 loadA3, loadA2.H1; +--:-:-:-:0 IADD track1A0.CC, track1A0, 2x<16>; +--:-:-:-:4 F2F.F32.F16 loadA1, loadA2.H0; +--:-:4:-:4 F2F.F32.F16 loadA2, loadA0.H1; +--:-:5:-:1 F2F.F32.F16 loadA0, loadA0.H0; + +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +02:-:-:-:1 STS.64 [writeAs + 4x<3*128>], loadA6; +04:-:-:-:1 STS.64 [writeAs + 4x<2*128>], loadA4; +08:-:-:-:1 STS.64 [writeAs + 4x<1*128>], loadA2; +10:-:-:-:1 STS.64 [writeAs + 4x<0*128>], loadA0; + + } : q{ + +20:-:-:-:4 F2F.F32.F16 loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 F2F.F32.F16 loadB1, loadB1; +--:-:-:-:4 F2F.F32.F16 loadB2, loadB2; 
+--:-:6:-:2 F2F.F32.F16 loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +20:-:-:-:1 STS.128 [writeBs], loadB0; + +02:-:-:-:4 F2F.F32.F16 loadA0, loadA0; +04:-:2:-:4 F2F.F32.F16 loadA1, loadA1; +--:-:-:-:0 IADD track0A0.CC, track0A0, 2x<8>; +--:-:-:-:4 F2F.F32.F16 loadA2, loadA2; +--:-:3:-:4 F2F.F32.F16 loadA3, loadA3; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; +--:-:-:-:4 F2F.F32.F16 loadA4, loadA4; +--:-:-:-:0 IADD track1A0.CC, track1A0, 2x<8>; +--:-:4:-:4 F2F.F32.F16 loadA5, loadA5; +--:-:-:-:4 F2F.F32.F16 loadA6, loadA6; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; +--:-:5:-:1 F2F.F32.F16 loadA7, loadA7; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +02:-:-:-:1 STS.64 [writeAs + 4x<0*128>], loadA0; +04:-:-:-:1 STS.64 [writeAs + 4x<1*128>], loadA2; +08:-:-:-:1 STS.64 [writeAs + 4x<2*128>], loadA4; +10:-:-:-:1 STS.64 [writeAs + 4x<3*128>], loadA6; + }; + + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + + + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n"); + our %insert = + ( + ($vec ? 
+ ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c15 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P5;\n", + + j0c10 => "--:-:2:-:1 \@P3 LDG.E.CI.64 loadB0, [trackB];\n", + + j0c28 => "--:-:4:-:1 \@P2 LDG.E.CI.64 loadA2, [track1A + 2x<0>];\n", + j0c30 => "--:-:4:-:1 \@P2 LDG.E.CI.64 loadAA2, [track1A + 2x<8>];\n", + + j0c31 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P4;\n", + + j0c44 => "--:-:5:-:1 \@P2 LDG.E.CI.64 loadA0, [track0A + 2x<0>];\n", + j0c46 => "--:-:6:-:1 \@P2 LDG.E.CI.64 loadAA0, [track0A + 2x<8>];\n", + + j3c53 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA7, loadAA3.H1;\n", + j3c57 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA6, loadAA1.H1;\n", + j3c61 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA5, loadAA3.H0;\n", + j4c1 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA4, loadAA1.H0;\n", + j4c5 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA3, loadAA2.H1;\n", + j4c9 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA1, loadAA2.H0;\n", + j4c13 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA2, loadAA0.H1;\n", + j4c17 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA0, loadAA0.H0;\n", + + j5c5 => "02:-:-:-:1 \@P3 F2F.F32.F16 loadB3, loadB1.H1;\n", + j5c9 => "--:-:-:-:1 \@P3 F2F.F32.F16 loadB2, loadB1.H0;\n", + j5c13 => "--:-:-:-:1 \@P3 F2F.F32.F16 loadB1, loadB0.H1;\n", + j5c17 => "--:-:2:-:1 \@P3 F2F.F32.F16 loadB0, loadB0.H0;\n", + + j5c35 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j5c53 => "08:-:-:-:1 \@P1 F2F.F32.F16 loadA7, loadA3.H1;\n", + j5c57 => "10:-:2:-:1 \@P1 F2F.F32.F16 loadA6, loadA1.H1;\n", + j5c61 => "--:-:-:-:1 \@P1 F2F.F32.F16 loadA5, loadA3.H0;\n", + j6c1 => "--:-:3:-:1 \@P1 F2F.F32.F16 loadA4, loadA1.H0;\n", + j6c5 => "--:-:-:-:1 \@P1 F2F.F32.F16 loadA3, loadA2.H1;\n", + j6c9 => "--:-:-:-:1 \@P1 F2F.F32.F16 loadA1, loadA2.H0;\n", + j6c13 => "--:-:4:-:1 \@P1 F2F.F32.F16 loadA2, loadA0.H1;\n", + j6c17 => "--:-:5:-:1 \@P1 F2F.F32.F16 loadA0, loadA0.H0;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS.64 [writeAs + 4x<3*128>], 
loadA6;\n", + j6c31 => "04:-:-:-:1 \@P0 STS.64 [writeAs + 4x<2*128>], loadA4;\n", + j6c33 => "08:-:-:-:1 \@P0 STS.64 [writeAs + 4x<1*128>], loadA2;\n", + j6c35 => "10:-:-:-:1 \@P0 STS.64 [writeAs + 4x<0*128>], loadA0;\n", + + j6c46 => "--:-:-:-:1 \@P1 IADD track1A0.CC, track1A0, 2x<16>;\n", + j6c54 => "--:-:-:-:1 \@P1 IADD.X track1A1, track1A1, RZ;\n", + j7c55 => "20:-:-:-:1 \@P1 IADD track0A0.CC, track0A0, 2x<16>;\n", + j7c61 => "--:-:-:-:1 \@P1 IADD.X track0A1, track0A1, RZ;\n", + + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P4;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + j0c10 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j0c12 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j0c14 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j0c16 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j0c33 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA0, [track0A + 2x<0>];\n", + j0c35 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA2, [track0A + 2x<1>];\n", + j0c37 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA4, [track0A + 2x<2>];\n", + j0c39 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA6, [track0A + 2x<3>];\n", + + j0c41 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + + j1c29 => "--:-:3:-:1 \@P2 LDG.E.CI.S16 loadA1, [track1A + 2x<0>];\n", + j1c31 => "--:-:3:-:1 \@P2 LDG.E.CI.S16 loadA3, [track1A + 2x<1>];\n", + j1c33 => "--:-:3:-:1 \@P2 LDG.E.CI.S16 loadA5, [track1A + 2x<2>];\n", + j1c35 => "--:-:3:-:1 \@P2 LDG.E.CI.S16 loadA7, [track1A + 2x<3>];\n", + + j5c8 => "20:-:-:-:1 \@P3 F2F.F32.F16 loadB0, loadB0;\n", + j5c12 => "--:-:-:-:1 \@P3 F2F.F32.F16 loadB1, loadB1;\n", + j5c16 => "--:-:-:-:1 \@P3 F2F.F32.F16 loadB2, loadB2;\n", + j5c20 => "--:-:6:-:1 \@P3 F2F.F32.F16 loadB3, loadB3;\n", + + j5c39 => "20:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j5c53 => "02:-:-:-:1 \@P0 F2F.F32.F16 loadA0, loadA0;\n", + j5c57 => "04:-:2:-:1 \@P0 F2F.F32.F16 loadA1, loadA1;\n", + j5c61 => "--:-:-:-:1 
\@P0 F2F.F32.F16 loadA2, loadA2;\n", + j6c1 => "--:-:3:-:1 \@P0 F2F.F32.F16 loadA3, loadA3;\n", + j6c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 loadA4, loadA4;\n", + j6c9 => "--:-:4:-:1 \@P0 F2F.F32.F16 loadA5, loadA5;\n", + j6c13 => "--:-:-:-:1 \@P0 F2F.F32.F16 loadA6, loadA6;\n", + j6c17 => "--:-:5:-:1 \@P0 F2F.F32.F16 loadA7, loadA7;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS.64 [writeAs + 4x<0*128>], loadA0;\n", + j6c31 => "04:-:-:-:1 \@P0 STS.64 [writeAs + 4x<1*128>], loadA2;\n", + j6c33 => "08:-:-:-:1 \@P0 STS.64 [writeAs + 4x<2*128>], loadA4;\n", + j6c35 => "10:-:-:-:1 \@P0 STS.64 [writeAs + 4x<3*128>], loadA6;\n", + + j6c46 => "--:-:-:-:1 \@P0 IADD track0A0.CC, track0A0, 2x<8>;\n", + j6c54 => "--:-:-:-:1 \@P0 IADD.X track0A1, track0A1, RZ;\n", + j6c55 => "--:-:-:-:1 \@P0 IADD track1A0.CC, track1A0, 2x<8>;\n", + j6c61 => "--:-:-:-:1 \@P0 IADD.X track1A1, track1A1, RZ;\n", + ) + ), + + j5c46 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:0 IADD32I k, k, -8;\n" . + "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + (j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n") : + (j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n") + ), + ); + return; + + + diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_16x64.sass b/Kernel/SGEMM/Pascal/hgemm_nn_16x64.sass new file mode 100644 index 0000000..1dfb949 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nn_16x64.sass @@ -0,0 +1,1171 @@ +# Kernel: hgemm_nn_16x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(16*64 + 32)*2 + 64*64*2> + szShareA : (16*64 + 32) + szShareB : (64*64) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24|32|40|48|56>, tid16_8, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta + + 96-135 : load0A<0-7>, load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>, load4B<0-3>, load5B<0-3>, load6B<0-3>, load7B<0-3> + 136-153 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>, track4B<0-1>, track5B<0-1>, track6B<0-1>, track7B<0-1> + + 154-161 ~ swapBuf, readAs, readBs, writeAs, 
writeBs, k, ldb64 + 162-171 ~ tid, blkA, blkB, blkZ, writeCs, preds, tid16 + + 0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3> + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + 96-99 : loadC<0-3> + 100-103 : b<0-3> + 104-107 : c<0-3> + 108-109 : C<0-1> + 110-161 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb8, ldb, 3; +--:-:-:-:1 SHL ldb64, ldb, 7; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// tidAX = tid >> 3 +// tidAY = (tid & 7) << 3 +// shiftAX = (tid & 7) << 2 +01:-:-:-:1 SHR.U32 tidAX, tid, 3; +--:-:-:-:1 LOP.AND tidAY, tid, 7; +--:-:-:-:1 SHL shiftAX, tidAY, 2; +--:-:-:-:1 SHL tidAY, tidAY, 3; + +// tidBX = (tid & 15) << 2 +// tidBY = tid >> 4 +01:-:-:-:1 LOP.AND tidBX, tid, 15; +--:-:-:-:1 SHL tidBX, tidBX, 2; +01:-:-:-:1 SHR.U32 tidBY, tid, 4; + +--:-:-:-:1 IADD tidBY8, tidBY, 8; +--:-:-:-:1 IADD tidBY16, tidBY, 16; +--:-:-:-:1 IADD tidBY24, tidBY, 24; +--:-:-:-:1 IADD tidBY32, tidBY, 32; +--:-:-:-:1 IADD tidBY40, tidBY, 40; +--:-:-:-:1 IADD tidBY48, tidBY, 48; +--:-:-:-:1 IADD tidBY56, tidBY, 56; + +// trackA += ((blkA*16 + tidAX) * lda + tidAY) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 4; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa, param_m, PT; + +// 
trackB += (blkB*64 + tidBX + ldb*tidBY) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; + +--:-:-:-:1 LEA track0B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track1B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track2B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track3B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track4B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track4B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track5B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track5B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track6B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track6B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track7B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track7B1, tb, param_B[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P3, PT, txb, param_n, PT; +[+ + our $vec; + return $vec ? 
'' : q{ +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb3, param_n, PT; + }; ++] +--:-:-:-:1 P2R preds, PR, RZ, 0x7c; + +// writeAs = (tidAY*16 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 4; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (tidBY*64 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (tid & 1) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid >> 1) & 7) << 4 +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHL readBs, readBs, 4; + +// tid16 = tid & -16 +// tid16_8 = tid16 / 2 * 4 +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHL tid16_8, tid16, 1; + +// writeCs = (readAs + tid16*2) * 64 + readBs; +--:-:-:-:1 ISCADD writeCs, tid16, readAs, 1; +--:-:-:-:1 ISCADD writeCs, writeCs, readBs, 6; + +// Each block of 16 threads works on 8 lines, readAs is also shifted over by 4 +// readAs += tid16_8 * 16 + tid16 +// readBs += tid16_8 * 64 + 4x +--:-:-:-:1 ISCADD readAs, tid16_8, readAs, 4; +--:-:-:-:1 ISCADD readBs, tid16_8, readBs, 6; +--:-:-:-:1 IADD readAs, tid16, readAs; +--:-:-:-:1 IADD readBs, readBs, 4x; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// If k is not a multiple of 64 we want to grab the partial amount on the first fetch. +// If it is a multiple of 64 then make a full 64 line fetch. +--:-:-:-:1 LOP.AND.Z P0, partialK, k, 63; +--:-:-:-:1 @P0 MOV partialK, 64; +--:-:-:-:1 IADD k, k, -partialK; +[+ + our $vec; + return $vec ?
q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY8, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY16, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY24, partialK, P3; + +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P1 LDG.E.CI.64 load0B, [track0B]; +--:-:3:-:1 @P4 LDG.E.CI.64 load1B, [track1B]; +--:-:4:-:1 @P5 LDG.E.CI.64 load2B, [track2B]; +--:-:4:-:1 @P6 LDG.E.CI.64 load3B, [track3B]; + + +--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:-:-:1 @!P1 LDS.U.64 load0B, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.64 load1B, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.64 load2B, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.64 load3B, [addr_zero]; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY32, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY40, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY48, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY56, partialK, P3; + +--:-:5:-:1 @P1 LDG.E.CI.64 load4B, [track4B]; +--:-:5:-:1 @P4 LDG.E.CI.64 load5B, [track5B]; +--:-:6:-:1 @P5 LDG.E.CI.64 load6B, [track6B]; +--:-:6:-:1 @P6 LDG.E.CI.64 load7B, [track7B]; + + +--:-:-:-:1 @!P1 LDS.U.64 load4B, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.64 load5B, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.64 load6B, [addr_zero]; +--:-:1:-:1 @!P6 LDS.U.64 load7B, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY1, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY2, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY3, partialK, P2; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:2:-:1 @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load0A0, RZ; +--:-:-:-:1 @!P4 MOV load0A1, 
RZ; +--:-:-:-:1 @!P5 MOV load0A2, RZ; +--:-:-:-:1 @!P6 MOV load0A3, RZ; + +--:-:-:-:1 IADD tidAY, tidAY, 4; +--:-:-:-:1 IADD tidAY1, tidAY1, 4; +--:-:-:-:1 IADD tidAY2, tidAY2, 4; +--:-:-:-:1 IADD tidAY3, tidAY3, 4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY1, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY2, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY3, partialK, P2; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @!P3 MOV load0A4, RZ; +--:-:-:-:1 @!P4 MOV load0A5, RZ; +--:-:-:-:1 @!P5 MOV load0A6, RZ; +--:-:-:-:1 @!P6 MOV load0A7, RZ; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load0B0, RZ; +--:-:-:-:1 @!P4 MOV load0B1, RZ; +--:-:-:-:1 @!P5 MOV load0B2, RZ; +--:-:-:-:1 @!P6 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY8, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x78; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load1B0, RZ; +--:-:-:-:1 @!P4 MOV load1B1, RZ; +--:-:-:-:1 @!P5 MOV load1B2, RZ; +--:-:-:-:1 @!P6 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY16, partialK, PT; +--:-:-:-:1 @P2 R2P PR, preds, 0x78; +--:-:-:-:1 @!P2 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 
@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load2B0, RZ; +--:-:-:-:1 @!P4 MOV load2B1, RZ; +--:-:-:-:1 @!P5 MOV load2B2, RZ; +--:-:-:-:1 @!P6 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY24, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load3B0, RZ; +--:-:-:-:1 @!P4 MOV load3B1, RZ; +--:-:-:-:1 @!P5 MOV load3B2, RZ; +--:-:-:-:1 @!P6 MOV load3B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY32, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x78; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load4B0, RZ; +--:-:-:-:1 @!P4 MOV load4B1, RZ; +--:-:-:-:1 @!P5 MOV load4B2, RZ; +--:-:-:-:1 @!P6 MOV load4B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY40, partialK, PT; +--:-:-:-:1 @P2 R2P PR, preds, 0x78; +--:-:-:-:1 @!P2 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load5B0, RZ; +--:-:-:-:1 @!P4 MOV load5B1, RZ; +--:-:-:-:1 @!P5 MOV load5B2, RZ; +--:-:-:-:1 @!P6 MOV load5B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY48, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>]; +--:-:-:-:1 @P4 
LDG.E.CI.U16 load6B1, [track6B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load6B0, RZ; +--:-:-:-:1 @!P4 MOV load6B1, RZ; +--:-:-:-:1 @!P5 MOV load6B2, RZ; +--:-:-:-:1 @!P6 MOV load6B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY56, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x78; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load7B0, RZ; +--:-:-:-:1 @!P4 MOV load7B1, RZ; +--:-:-:-:1 @!P5 MOV load7B2, RZ; +--:-:-:-:1 @!P6 MOV load7B3, RZ; + }; ++] +// partialB = partialK * ldb +--:-:-:-:1 XMAD.LO2 partialB, ldb, partialK, RZ; + +--:-:-:-:1 ISETP.GE.AND P1, PT, k, 64, PT; +--:-:-:-:1 IADD k, k, -64; +--:-:-:-:1 @P1 R2P PR, preds, 0x7c; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c; + + +[+ + our $vec; + return $vec ? 
q{ +03:-:-:-:1 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A2.H1; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0.H0; + } : q{ +02:-:-:-:1 F2F.F32.F16 load0A7, load0A7; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A6; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A5; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A4; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A3; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0; + }; ++] +--:-:-:-:0 LEA track0A0.CC, partialK, track0A0, 1; +01:-:-:-:1 STS [writeAs + 4x<7*16>], load0A7; +--:-:-:-:1 STS [writeAs + 4x<6*16>], load0A6; +--:-:-:-:1 STS [writeAs + 4x<5*16>], load0A5; +--:-:-:-:1 STS [writeAs + 4x<4*16>], load0A4; +02:-:-:-:1 STS [writeAs + 4x<3*16>], load0A3; +--:-:-:-:1 STS [writeAs + 4x<2*16>], load0A2; +--:-:-:-:1 STS [writeAs + 4x<1*16>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<0*16>], load0A0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +04:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:1:-:1 F2F.F32.F16 load0B0, load0B0.H0; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + } : q{ +04:-:-:-:1 F2F.F32.F16 load0B0, load0B0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:1:-:1 F2F.F32.F16 load0B3, load0B3; +--:-:-:-:1 F2F.F32.F16 load1B0, load1B0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:2:-:1 F2F.F32.F16 load1B3, load1B3; + }; ++] +--:-:-:-:0 LEA track0B0.CC, partialB, track0B0, 1; +01:-:-:-:6 STS.128 [writeBs + 4x<0*64>], load0B; +--:-:-:-:1 IADD.X track0B1, track0B1, RZ; + +--:-:-:-:0 LEA track1B0.CC, partialB, track1B0, 1; +02:-:-:-:6 STS.128 [writeBs + 4x<8*64>], load1B; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +08:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:1:-:1 F2F.F32.F16 load2B0, load2B0.H0; +--:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:2:-:1 F2F.F32.F16 load3B0, load3B0.H0; + } : q{ +08:-:-:-:1 F2F.F32.F16 load2B0, load2B0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:1:-:1 F2F.F32.F16 load2B3, load2B3; +--:-:-:-:1 F2F.F32.F16 load3B0, load3B0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:2:-:1 F2F.F32.F16 load3B3, load3B3; + }; ++] +--:-:-:-:0 LEA track2B0.CC, partialB, track2B0, 1; +01:-:-:-:6 STS.128 [writeBs + 4x<16*64>], load2B; +--:-:-:-:1 IADD.X track2B1, track2B1, RZ; + +--:-:-:-:0 LEA track3B0.CC, partialB, track3B0, 1; +02:-:-:-:6 STS.128 [writeBs + 4x<24*64>], load3B; +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +10:-:-:-:1 F2F.F32.F16 load4B3, load4B1.H1; +--:-:-:-:1 F2F.F32.F16 load4B2, load4B1.H0; +--:-:-:-:1 F2F.F32.F16 load4B1, load4B0.H1; +--:-:1:-:1 F2F.F32.F16 load4B0, load4B0.H0; +--:-:-:-:1 F2F.F32.F16 load5B3, load5B1.H1; +--:-:-:-:1 F2F.F32.F16 load5B2, load5B1.H0; +--:-:-:-:1 F2F.F32.F16 load5B1, load5B0.H1; +--:-:2:-:1 F2F.F32.F16 load5B0, load5B0.H0; + } : q{ +10:-:-:-:1 F2F.F32.F16 load4B0, load4B0; +--:-:-:-:1 F2F.F32.F16 load4B1, load4B1; +--:-:-:-:1 F2F.F32.F16 load4B2, load4B2; +--:-:1:-:1 F2F.F32.F16 load4B3, load4B3; +--:-:-:-:1 F2F.F32.F16 load5B0, load5B0; +--:-:-:-:1 F2F.F32.F16 load5B1, load5B1; +--:-:-:-:1 F2F.F32.F16 load5B2, load5B2; +--:-:2:-:1 F2F.F32.F16 load5B3, load5B3; + }; ++] +--:-:-:-:0 LEA track4B0.CC, partialB, track4B0, 1; +01:-:-:-:6 STS.128 [writeBs + 4x<32*64>], load4B; +--:-:-:-:1 IADD.X track4B1, track4B1, RZ; + +--:-:-:-:0 LEA track5B0.CC, partialB, track5B0, 1; +02:-:-:-:6 STS.128 [writeBs + 4x<40*64>], load5B; +--:-:-:-:0 IADD.X track5B1, track5B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +20:-:-:-:1 F2F.F32.F16 load6B3, load6B1.H1; +--:-:-:-:1 F2F.F32.F16 load6B2, load6B1.H0; +--:-:-:-:1 F2F.F32.F16 load6B1, load6B0.H1; +--:-:1:-:1 F2F.F32.F16 load6B0, load6B0.H0; +--:-:-:-:1 F2F.F32.F16 load7B3, load7B1.H1; +--:-:-:-:1 F2F.F32.F16 load7B2, load7B1.H0; +--:-:-:-:1 F2F.F32.F16 load7B1, load7B0.H1; +--:-:2:-:1 F2F.F32.F16 load7B0, load7B0.H0; + } : q{ +20:-:-:-:1 F2F.F32.F16 load6B0, load6B0; +--:-:-:-:1 F2F.F32.F16 load6B1, load6B1; +--:-:-:-:1 F2F.F32.F16 load6B2, load6B2; +--:-:1:-:1 F2F.F32.F16 load6B3, load6B3; +--:-:-:-:1 F2F.F32.F16 load7B0, load7B0; +--:-:-:-:1 F2F.F32.F16 load7B1, load7B1; +--:-:-:-:1 F2F.F32.F16 load7B2, load7B2; +--:-:2:-:1 F2F.F32.F16 load7B3, load7B3; + }; ++] +--:-:-:-:0 LEA track6B0.CC, partialB, track6B0, 1; +01:-:-:-:6 STS.128 [writeBs + 4x<48*64>], load6B; +--:-:-:-:1 IADD.X track6B1, track6B1, RZ; + +--:-:-:-:0 LEA track7B0.CC, partialB, track7B0, 1; +02:-:-:-:6 STS.128 [writeBs + 4x<56*64>], load7B; +--:-:-:-:0 IADD.X track7B1, track7B1, RZ; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*16 + 00>]; +--:-:-:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*16 + 08>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +[+ + our $vec; + return $vec ? 
q{ +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.64 load0B, [track0B]; +--:-:3:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load2B, [track2B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load3B, [track3B]; +--:-:5:-:1 @P3 LDG.E.CI.64 load4B, [track4B]; +--:-:5:-:1 @P3 LDG.E.CI.64 load5B, [track5B]; +--:-:6:-:1 @P3 LDG.E.CI.64 load6B, [track6B]; +--:-:6:-:1 @P3 LDG.E.CI.64 load7B, [track7B]; + } : q{ +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>]; +--:-:5:-:1 
@P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>]; + }; ++] + +LOOP: + +[+ + our $vec; + our %insert = + ( + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, RZ, PT;\n", + j0c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, k, 64, PT;\n" . + "--:-:-:-:1 IADD k, k, -64;\n", + + j0c23 => "--:-:-:-:1 \@P1 R2P PR, preds, 0x7c;\n", + j0c24 => "--:-:-:-:1 \@!P1 R2P PR, RZ, 0x7c;\n", + + j2c32 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<64>;\n", + j2c37 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + + j3c25 => "--:-:-:-:1 \@P3 IADD track0B0.CC, track0B0, ldb64;\n", + j3c30 => "--:-:-:-:1 \@P3 IADD.X track0B1, track0B1, RZ;\n", + j3c32 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, ldb64;\n", + j3c37 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + + j4c25 => "--:-:-:-:1 \@P3 IADD track2B0.CC, track2B0, ldb64;\n", + j4c30 => "--:-:-:-:1 \@P3 IADD.X track2B1, track2B1, RZ;\n", + j4c32 => "--:-:-:-:1 \@P3 IADD track3B0.CC, track3B0, ldb64;\n", + j4c37 => "--:-:-:-:1 \@P3 IADD.X track3B1, track3B1, RZ;\n", + + j5c25 => "--:-:-:-:1 \@P3 IADD track4B0.CC, track4B0, ldb64;\n", + j5c30 => "--:-:-:-:1 \@P3 IADD.X track4B1, track4B1, RZ;\n", + j5c32 => "--:-:-:-:1 \@P3 IADD track5B0.CC, track5B0, ldb64;\n", + j5c37 => "--:-:-:-:1 \@P3 IADD.X track5B1, track5B1, RZ;\n", + + j6c25 => "--:-:-:-:1 \@P3 
IADD track6B0.CC, track6B0, ldb64;\n", + j6c30 => "--:-:-:-:1 \@P3 IADD.X track6B1, track6B1, RZ;\n", + j6c32 => "--:-:-:-:1 \@P3 IADD track7B0.CC, track7B0, ldb64;\n", + j6c37 => "--:-:-:-:1 \@P3 IADD.X track7B1, track7B1, RZ;\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j2c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<7*16>], load0A7;\n", + j2c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*16>], load0A6;\n", + j2c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*16>], load0A5;\n", + j2c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*16>], load0A4;\n", + j2c24 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*16>], load0A3;\n", + j2c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*16>], load0A2;\n", + j2c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*16>], load0A1;\n", + j2c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<0*16>], load0A0;\n", + + j3c16 => "04:-:-:-:1 \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n", + j3c20 => "--:3:-:-:1 \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n", + + j4c16 => "08:-:-:-:1 \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n", + j4c20 => "--:4:-:-:1 \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n", + + j5c16 => "10:-:-:-:1 \@P0 STS.128 [writeBs + 4x<32*64>], load4B;\n", + j5c20 => "--:5:-:-:1 \@P0 STS.128 [writeBs + 4x<40*64>], load5B;\n", + + j6c16 => "20:-:-:-:1 \@P0 STS.128 [writeBs + 4x<48*64>], load6B;\n", + j6c20 => "--:6:-:-:1 \@P0 STS.128 [writeBs + 4x<56*64>], load7B;\n", + + ($vec ? 
+ ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j2c36 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n", + j2c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j3c36 => "08:-:-:-:1 \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n", + j3c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n", + j3c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n", + j3c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n", + j3c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j4c36 => "10:-:-:-:1 \@P0 F2F.F32.F16 load4B3, load4B1.H1;\n", + j4c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B2, load4B1.H0;\n", + j4c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B1, load4B0.H1;\n", + j4c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B0, load4B0.H0;\n", + j4c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B3, load5B1.H1;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B2, load5B1.H0;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 
load5B1, load5B0.H1;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load5B0, load5B0.H0;\n", + + j5c36 => "20:-:-:-:1 \@P0 F2F.F32.F16 load6B3, load6B1.H1;\n", + j5c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B2, load6B1.H0;\n", + j5c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B1, load6B0.H1;\n", + j5c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B0, load6B0.H0;\n", + j5c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B3, load7B1.H1;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B2, load7B1.H0;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B1, load7B0.H1;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load7B0, load7B0.H0;\n", + + j2c61 => "02:-:2:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j3c60 => "04:-:-:-:1 \@P3 LDG.E.CI.64 load0B, [track0B];\n", + j3c62 => "--:-:3:-:1 \@P3 LDG.E.CI.64 load1B, [track1B];\n", + j4c60 => "08:-:-:-:1 \@P3 LDG.E.CI.64 load2B, [track2B];\n", + j4c62 => "--:-:4:-:1 \@P3 LDG.E.CI.64 load3B, [track3B];\n", + j5c60 => "10:-:-:-:1 \@P3 LDG.E.CI.64 load4B, [track4B];\n", + j5c62 => "--:-:5:-:1 \@P3 LDG.E.CI.64 load5B, [track5B];\n", + j6c60 => "20:-:-:-:1 \@P3 LDG.E.CI.64 load6B, [track6B];\n", + j6c62 => "--:-:6:-:1 \@P3 LDG.E.CI.64 load7B, [track7B];\n", + ) : + ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A0, load0A0;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A1;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A2;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A3;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A4;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A5;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+ # The last A-side F2F arms barrier 2 in the third control field, matching the $vec branch and the j2c63..j5c63 entries: the j2c16 STS of load0A7 waits on 02 and needs the converted value. NOTE(review): third field taken to be the write barrier per the maxas control-code notation - confirm.
+ j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A7, load0A7;\n", + + j2c36 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0;\n", + j2c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B1;\n", + j2c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B2;\n", + j2c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B3;\n", + j2c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B0, load1B0;\n", 
+ j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B1;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B2;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load1B3, load1B3;\n", + + j3c36 => "08:-:-:-:1 \@P0 F2F.F32.F16 load2B0, load2B0;\n", + j3c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B1;\n", + j3c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B2;\n", + j3c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B3, load2B3;\n", + j3c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B0, load3B0;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B1;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B2;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load3B3, load3B3;\n", + + j4c36 => "10:-:-:-:1 \@P0 F2F.F32.F16 load4B0, load4B0;\n", + j4c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B1, load4B1;\n", + j4c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B2, load4B2;\n", + j4c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load4B3, load4B3;\n", + j4c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B0, load5B0;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B1, load5B1;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load5B2, load5B2;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load5B3, load5B3;\n", + + j5c36 => "20:-:-:-:1 \@P0 F2F.F32.F16 load6B0, load6B0;\n", + j5c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B1, load6B1;\n", + j5c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B2, load6B2;\n", + j5c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load6B3, load6B3;\n", + j5c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B0, load7B0;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B1, load7B1;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load7B2, load7B2;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load7B3, load7B3;\n", + + j2c48 => "02:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j2c50 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j2c52 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j2c54 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + j2c56 => "--:-:-:-:1 
\@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n", + j2c58 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n", + j2c60 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n", + j2c62 => "--:-:2:-:1 \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n", + + j3c48 => "04:-:-:-:1 \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j3c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j3c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j3c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + j3c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j3c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j3c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j3c62 => "--:-:3:-:1 \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + + j4c48 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j4c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j4c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j4c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + j4c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j4c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j4c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j4c62 => "--:-:4:-:1 \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + + j5c48 => "10:-:-:-:1 \@P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>];\n", + j5c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>];\n", + j5c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>];\n", + j5c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>];\n", + j5c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>];\n", + j5c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>];\n", + j5c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>];\n", + j5c62 => "--:-:5:-:1 \@P6 LDG.E.CI.U16 
load5B3, [track5B + 2x<3>];\n", + + j6c48 => "20:-:-:-:1 \@P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>];\n", + j6c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>];\n", + j6c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>];\n", + j6c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>];\n", + j6c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>];\n", + j6c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>];\n", + j6c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>];\n", + j6c62 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>];\n", + ) + ), + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out = ''; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*16 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*16 + 08>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHR.U32 tid16, tid, 4; +--:-:-:-:1 SHL tid15, tid15, 2; +--:-:-:-:1 ISCADD readCs, tid16, tid15, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*64 + tid15; +--:-:-:-:1 ISCADD cx, blkB, tid15, 6; +--:-:-:-:1 IADD cx1, cx, 1; +--:-:-:-:1 IADD cx2, cx, 2; +--:-:-:-:1 IADD cx3, cx, 3; + +// cy = blkA*16 + tid16 +--:-:-:-:1 ISCADD cy, blkA, tid16, 4; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 SHL ldc8, ldc, 4; + +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C1, ci, param_C[1], RZ, 1; + +// P0 = cx < n +--:-:-:-:1 ISETP.LT.AND P0, PT, cx, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, cx1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, cx2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, cx3, param_n, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +// P4 = cy < m +--:-:-:-:1 ISETP.LT.AND P4, PT, cy, param_m, PT; + +// P5 = beta != 0 && P4 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P4; + +// P6 = Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; + +// Init beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, cx7y4, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:5 EXIT; + +STORE_C: + +[+ + our $vec; + return $vec ? 
q{ +--:-:1:-:1 @P0 LDG.E.64 loadC, [C]; + } : q{ +--:-:-:-:0 @!P0 MOV loadC0, RZ; +--:-:-:-:1 @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>]; +--:-:-:-:0 @!P1 MOV loadC1, RZ; +--:-:-:-:1 @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>]; +--:-:-:-:0 @!P2 MOV loadC2, RZ; +--:-:-:-:1 @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>]; +--:-:-:-:0 @!P3 MOV loadC3, RZ; +--:-:1:-:1 @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>]; + }; ++] + +// Restore output preds +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 LDS.U.128 part0C, [readCs + 4x<0*8*64>]; +--:-:2:-:1 LDS.U.128 part1C, [readCs + 4x<1*8*64>]; +--:-:-:-:1 LDS.U.128 part2C, [readCs + 4x<2*8*64>]; +--:-:3:-:1 LDS.U.128 part3C, [readCs + 4x<3*8*64>]; +--:-:-:-:1 LDS.U.128 part4C, [readCs + 4x<4*8*64>]; +--:-:4:-:1 LDS.U.128 part5C, [readCs + 4x<5*8*64>]; +--:-:-:-:1 LDS.U.128 part6C, [readCs + 4x<6*8*64>]; +--:-:5:-:1 LDS.U.128 part7C, [readCs + 4x<7*8*64>]; + + +02:-:-:-:1 @P0 FADD part0C0, part0C0, part1C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part1C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part1C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part1C3; + +04:-:-:-:1 @P0 FADD part2C0, part2C0, part3C0; +--:-:-:-:1 @P1 FADD part2C1, part2C1, part3C1; +--:-:-:-:1 @P2 FADD part2C2, part2C2, part3C2; +--:-:-:-:1 @P3 FADD part2C3, part2C3, part3C3; + +08:-:-:-:1 @P0 FADD part4C0, part4C0, part5C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part5C1; +--:-:-:-:1 @P2 FADD part4C2, part4C2, part5C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part5C3; + +10:-:-:-:1 @P0 FADD part6C0, part6C0, part7C0; +--:-:-:-:1 @P1 FADD part6C1, part6C1, part7C1; +--:-:-:-:1 @P2 FADD part6C2, part6C2, part7C2; +--:-:-:-:1 @P3 FADD part6C3, part6C3, part7C3; + +--:-:-:-:1 @P0 FADD part0C0, part0C0, part2C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part2C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part2C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part2C3; + +--:-:-:-:1 @P0 FADD part4C0, part4C0, part6C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part6C1; +--:-:-:-:1 
@P2 FADD part4C2, part4C2, part6C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part6C3; + +--:-:-:-:1 @P0 FADD c0, part0C0, part4C0; +--:-:-:-:1 @P1 FADD c1, part0C1, part4C1; +--:-:-:-:1 @P2 FADD c2, part0C2, part4C2; +--:-:-:-:1 @P3 FADD c3, part0C3, part4C3; + + +--:-:-:-:0 IADD cy, cy, 8; + +[+ + our $vec; + return $vec ? q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0.H0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC0.H1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC1.H0; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC1.H1; + } : q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC2; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC3; + }; ++] + +01:-:-:-:1 @P5 FFMA c0, b0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, b1, beta, c1; +04:-:-:-:1 @P5 FFMA c2, b2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, b3, beta, c3; + +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:0 ISETP.LT.AND P5, PT, cy, param_m, P5; + +--:-:1:-:1 @P0 F2F.F16.F32 c0, c0; +--:-:2:-:1 @P1 F2F.F16.F32 c1, c1; + +--:-:-:-:0 ISETP.LT.AND P4, PT, cy, param_m, PT; + +--:-:3:-:1 @P2 F2F.F16.F32 c2, c2; +--:-:4:-:1 @P3 F2F.F16.F32 c3, c3; + +[+ + our $vec; + return $vec ? 
q{ +03:-:-:-:2 @P0 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 @P0 BFI c1, c3, 0x1010, c2; + +--:1:-:-:1 @P0 STG.E.CG.64 [C], c; + } : q{ +01:-:-:-:1 @P0 STG.E.U16 [C + 2x<0>], c0; +02:-:-:-:1 @P1 STG.E.U16 [C + 2x<1>], c1; +04:-:-:-:1 @P2 STG.E.U16 [C + 2x<2>], c2; +08:1:-:-:1 @P3 STG.E.U16 [C + 2x<3>], c3; + }; ++] + +// Restore beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +01:-:-:-:6 IADD C0.CC, C0, ldc8; +--:-:-:-:0 IADD.X C1, C1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_32x128.sass b/Kernel/SGEMM/Pascal/hgemm_nn_32x128.sass new file mode 100644 index 0000000..8c4510d --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nn_32x128.sass @@ -0,0 +1,562 @@ +# Kernel: hgemm_nn_32x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*16*2 + (32*16 + 32)*2> + szShareA : (32*16 + 32) + szShareB : (128*16) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadA<0-3> + 84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + + 100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; +--:-:3:-:1 S2R blkA, SR_CTAID.Y; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, 
param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb4, ldb, 2; +--:-:-:-:1 SHL ldb16, ldb, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) +01:-:-:-:1 LOP.AND tidBX, tid, 31; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 5; + +// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 2 +04:-:-:-:1 ISCADD txa, blkA, tidAX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 1; + +// trackB += (blkB*128 + tidBX + ldb*tidBY) * 2 +02:-:-:-:1 ISCADD txb, blkB, tidBX, 7; +--:-:-:-:1 XMAD.LO2 tb0, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb0, ldbz, blkZ, tb0; +--:-:-:-:1 IADD tb1, tb0, ldb4; +--:-:-:-:1 IADD tb2, tb1, ldb4; +--:-:-:-:1 IADD tb3, tb2, ldb4; + +--:-:-:-:1 LEA track0B0.CC, tb0, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb0, param_B[1], RZ, 1; +--:-:-:-:1 LEA track1B0.CC, tb1, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb1, param_B[1], RZ, 1; +--:-:-:-:1 LEA track2B0.CC, tb2, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb2, param_B[1], RZ, 1; +--:-:-:-:1 LEA track3B0.CC, tb3, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb3, param_B[1], RZ, 1; + +// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*128 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 7; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; 
+--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4 +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readBs, readBs, tid96; +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + +--:-:-:-:1 IADD tidBY1, tidBY, 4; +--:-:-:-:1 IADD tidBY2, tidBY, 8; +--:-:-:-:1 IADD tidBY3, tidBY, 12; + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txb, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidBY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.CI.64 load0B, [track0B]; +--:-:2:-:1 @P1 LDG.E.CI.64 load1B, [track1B]; +--:-:3:-:1 @P2 LDG.E.CI.64 load2B, [track2B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load3B, [track3B]; +--:-:5:-:1 @P4 LDG.E.CI.64 loadA, [trackA]; + + + +--:-:6:-:1 @!P0 LDS.U.64 load0B, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.64 load1B, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.64 load2B, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 load3B, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.64 loadA, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:1:-:1 @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:1:-:1 @P1 
LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:1:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:1:-:1 @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY1, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P5; + +--:-:2:-:1 @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:3:-:1 @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:4:-:1 @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:4:-:1 @P3 
LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P6; + +--:-:5:-:1 @P0 LDG.E.CI.U16 loadA0, [trackA + 2x<0>]; +--:-:5:-:1 @P1 LDG.E.CI.U16 loadA1, [trackA + 2x<1>]; +--:-:5:-:1 @P2 LDG.E.CI.U16 loadA2, [trackA + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 loadA3, [trackA + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P0, RZ, k, 15; +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 16, P0; + + + + + our $vec; + return $vec ? 
q{ +21:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:1:-:1 F2F.F32.F16 load0B0, load0B0.H0; + +02:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + +04:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:3:-:1 F2F.F32.F16 load2B0, load2B0.H0; + +08:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:4:-:1 F2F.F32.F16 load3B0, load3B0.H0; + +10:-:-:-:1 F2F.F32.F16 loadA3, loadA1.H1; +--:-:-:-:1 F2F.F32.F16 loadA2, loadA1.H0; +--:-:-:-:1 F2F.F32.F16 loadA1, loadA0.H1; +--:-:5:-:1 F2F.F32.F16 loadA0, loadA0.H0; + } : q{ +21:-:-:-:1 F2F.F32.F16 load0B0, load0B0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:1:-:1 F2F.F32.F16 load0B3, load0B3; + +02:-:-:-:1 F2F.F32.F16 load1B0, load1B0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:2:-:1 F2F.F32.F16 load1B3, load1B3; + +04:-:-:-:1 F2F.F32.F16 load2B0, load2B0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:3:-:1 F2F.F32.F16 load2B3, load2B3; + +08:-:-:-:1 F2F.F32.F16 load3B0, load3B0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:4:-:1 F2F.F32.F16 load3B3, load3B3; + +10:-:-:-:1 F2F.F32.F16 loadA0, loadA0; +--:-:-:-:1 F2F.F32.F16 loadA1, loadA1; +--:-:-:-:1 F2F.F32.F16 loadA2, loadA2; +--:-:5:-:1 F2F.F32.F16 loadA3, loadA3; + }; + + +01:-:-:-:1 STS.128 [writeBs + 4x<0*128>], load0B; +--:-:-:-:6 IADD track0B0.CC, track0B0, ldb16; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +02:-:-:-:1 STS.128 [writeBs + 4x<4*128>], load1B; +--:-:-:-:6 IADD track1B0.CC, track1B0, 
ldb16; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +04:-:-:-:1 STS.128 [writeBs + 4x<8*128>], load2B; +--:-:-:-:6 IADD track2B0.CC, track2B0, ldb16; +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +08:-:-:-:1 STS.128 [writeBs + 4x<12*128>], load3B; +--:-:-:-:6 IADD track3B0.CC, track3B0, ldb16; +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +10:-:-:-:1 STS [writeAs + 4x<0*32>], loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*32>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*32>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*32>], loadA3; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.CI.64 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.64 load2B, [track2B]; +--:-:5:-:1 @P5 LDG.E.CI.64 load3B, [track3B]; +--:-:6:-:1 @P6 LDG.E.CI.64 loadA, [trackA]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; 
+--:-:5:-:1 @P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n", + j9c6 => "10:5:-:-:1 \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n", + j11c6 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<3*32>], loadA3;\n", + j11c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], loadA2;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], loadA1;\n", + j11c12 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<0*32>], loadA0;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0B0.CC, track0B0, ldb16;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0B1, track0B1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, ldb16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2B0.CC, track2B0, ldb16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2B1, track2B1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3B0.CC, track3B0, ldb16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3B1, track3B1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackA0.CC, trackA0, 2x<16>;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackA1, trackA1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . 
+ "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.64 load0B, [track0B];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1B, [track1B];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.64 load2B, [track2B];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.64 load3B, [track3B];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.64 loadA, [trackA];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1B3, load1B1.H1;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B2, load1B1.H0;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B1, load1B0.H1;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j6c13 => "10:-:-:-:1 \@P5 F2F.F32.F16 load2B3, load2B1.H1;\n", + j6c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2B2, load2B1.H0;\n", + j6c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2B1, load2B0.H1;\n", + j6c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load2B0, load2B0.H0;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B3, load3B1.H1;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B2, load3B1.H0;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B1, load3B0.H1;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadA3, loadA1.H1;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA2, loadA1.H0;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA1, loadA0.H1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA0, loadA0.H0;\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j3c31 
=> "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0B0, load0B0;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B1, load0B1;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B2, load0B2;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0B3, load0B3;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1B0, load1B0;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B1, load1B1;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B2, load1B2;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1B3, load1B3;\n", + + j6c13 => "10:-:-:-:1 \@P5 F2F.F32.F16 load2B0, load2B0;\n", + j6c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2B1, 
load2B1;\n", + j6c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2B2, load2B2;\n", + j6c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load2B3, load2B3;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B0, load3B0;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B1, load3B1;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B2, load3B2;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3B3, load3B3;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadA3, loadA3;\n", + j10c17 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA2, loadA2;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA1, loadA1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA0, loadA0;\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_32x64.sass b/Kernel/SGEMM/Pascal/hgemm_nn_32x64.sass new file mode 100644 index 0000000..56b813f --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nn_32x64.sass @@ -0,0 +1,913 @@ +# Kernel: hgemm_nn_32x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<32*33*2 + 64*32*2> + szShareA : (32*33) + szShareB : (64*32) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24>, tid1, tid32, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta, xmad_tb + + 96-119 : load0A<0-7>, load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + 120-129 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 130-137 ~ swapBuf, readAs, readBs, writeAs, writeBs, k, ldb32 + 138-144 ~ tid, blkA, blkB, blkZ, writeCs, preds + + 0-15 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3> + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + 96-99 : loadC<0-3> + 100-103 : b<0-3> + 104-107 : c<0-3> + 108-109 : C<0-1> + 110-137 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15, tid16 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, 
SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb8, ldb, 3; +--:-:-:-:1 SHL ldb32, ldb, 6; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 3 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +--:-:-:-:1 LOP.AND tidAY, tid, 3; +--:-:-:-:1 SHL shiftAX, tidAY, 3; +--:-:-:-:1 SHL tidAY, tidAY, 3; + +// tidBX = (tid & 15) << 2 +// tidBY = tid >> 4 +01:-:-:-:1 LOP.AND tidBX, tid, 15; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 4; + +--:-:-:-:1 IADD tidBY8, tidBY, 8; +--:-:-:-:1 IADD tidBY16, tidBY, 16; +--:-:-:-:1 IADD tidBY24, tidBY, 24; + +// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa, param_m, PT; + +// trackB += (blkB*64 + tidBX + ldb*tidBY) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA track0B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track1B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track2B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb, param_B[1], RZ, 1; +--:-:-:-:1 IADD tb, tb, ldb8; +--:-:-:-:1 LEA track3B0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb, param_B[1], RZ, 1; + +--:-:-:-:1 ISETP.LT.AND P3, PT, txb, param_n, PT; +[+ + our 
$vec; + return $vec ? '' : q{ +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb3, param_n, PT; + }; ++] +--:-:-:-:1 P2R preds, PR, RZ, 0x7c; + +// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (tidBY*64 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid >> 1) & 7) << 4 +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHL readBs, readBs, 4; + +// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5) +// tid32 = tid & -32 +--:-:-:-:1 LOP.AND tid32, tid, -32; + +// Write out the 4 groups of 32 rows 16 at a time +// writeCs = (readAs + tid32/2*4) * 64 + readBs +--:-:-:-:1 ISCADD writeCs, tid32, readAs, 1; +--:-:-:-:1 ISCADD writeCs, writeCs, readBs, 6; + +// Each block of 32 threads works on 8 lines, +// readAs is also shifted over by 8 for each group of 32 threads +// readAs += tid32/4 * 32 * 4 + tid32/4 * 4 +// readBs += tid32/4 * 64 * 4 + 4x +--:-:-:-:1 ISCADD readAs, tid32, readAs, 5; +--:-:-:-:1 ISCADD readBs, tid32, readBs, 6; +--:-:-:-:1 IADD readAs, tid32, readAs; +--:-:-:-:1 IADD readBs, readBs, 4x; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// If k is not a multiple of 32 we want to grab the partial amount on the first fetch. +// If it is a multiple of 32 then make a full 32 line fetch.
+--:-:-:-:1 LOP.AND.Z P0, partialK, k, 31; +--:-:-:-:1 @P0 MOV partialK, 32; +--:-:-:-:1 IADD k, k, -partialK; +[+ + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY8, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY16, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY24, partialK, P3; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidBY, partialK, P3; + +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.64 load0B, [track0B]; +--:-:4:-:1 @P4 LDG.E.CI.64 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.64 load2B, [track2B]; +--:-:6:-:1 @P6 LDG.E.CI.64 load3B, [track3B]; + + +--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.64 load0B, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.64 load1B, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.64 load2B, [addr_zero]; +--:-:1:-:1 @!P6 LDS.U.64 load3B, [addr_zero]; + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY1, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY2, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY3, partialK, P2; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:2:-:1 @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load0A0, RZ; +--:-:-:-:1 @!P4 MOV load0A1, RZ; +--:-:-:-:1 @!P5 MOV load0A2, RZ; +--:-:-:-:1 @!P6 MOV load0A3, RZ; + +--:-:-:-:1 IADD tidAY, tidAY, 4; +--:-:-:-:1 IADD tidAY1, tidAY1, 4; +--:-:-:-:1 IADD tidAY2, tidAY2, 4; +--:-:-:-:1 IADD tidAY3, tidAY3, 4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY1, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY2, partialK, P2; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY3, partialK, P2; + +--:-:-:-:1 @P3 LDG.E.CI.U16 
load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @!P3 MOV load0A4, RZ; +--:-:-:-:1 @!P4 MOV load0A5, RZ; +--:-:-:-:1 @!P5 MOV load0A6, RZ; +--:-:-:-:1 @!P6 MOV load0A7, RZ; + + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load0B0, RZ; +--:-:-:-:1 @!P4 MOV load0B1, RZ; +--:-:-:-:1 @!P5 MOV load0B2, RZ; +--:-:-:-:1 @!P6 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY8, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x78; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load1B0, RZ; +--:-:-:-:1 @!P4 MOV load1B1, RZ; +--:-:-:-:1 @!P5 MOV load1B2, RZ; +--:-:-:-:1 @!P6 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY16, partialK, PT; +--:-:-:-:1 @P2 R2P PR, preds, 0x78; +--:-:-:-:1 @!P2 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load2B0, RZ; +--:-:-:-:1 @!P4 MOV load2B1, RZ; +--:-:-:-:1 @!P5 MOV load2B2, RZ; +--:-:-:-:1 @!P6 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY24, partialK, PT; +--:-:-:-:1 @P0 R2P PR, preds, 0x78; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x78; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load3B0, 
[track3B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P3 MOV load3B0, RZ; +--:-:-:-:1 @!P4 MOV load3B1, RZ; +--:-:-:-:1 @!P5 MOV load3B2, RZ; +--:-:-:-:1 @!P6 MOV load3B3, RZ; + + }; ++] +// partialB = partialK * ldb +--:-:-:-:1 XMAD.LO2 partialB, ldb, partialK, RZ; + +--:-:-:-:1 ISETP.GE.AND P1, PT, k, 32, PT; +--:-:-:-:1 IADD k, k, -32; +--:-:-:-:1 @P1 R2P PR, preds, 0x7c; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c; + + +[+ + our $vec; + return $vec ? q{ +03:-:-:-:1 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A2.H1; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0.H0; + } : q{ +02:-:-:-:1 F2F.F32.F16 load0A7, load0A7; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A6; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A5; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A4; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A3; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0; + }; ++] +--:-:-:-:0 LEA track0A0.CC, partialK, track0A0, 1; +01:-:-:-:1 STS [writeAs + 4x<7*32>], load0A7; +--:-:-:-:1 STS [writeAs + 4x<6*32>], load0A6; +--:-:-:-:1 STS [writeAs + 4x<5*32>], load0A5; +--:-:-:-:1 STS [writeAs + 4x<4*32>], load0A4; +02:-:-:-:1 STS [writeAs + 4x<3*32>], load0A3; +--:-:-:-:1 STS [writeAs + 4x<2*32>], load0A2; +--:-:-:-:1 STS [writeAs + 4x<1*32>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<0*32>], load0A0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +04:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:3:-:1 F2F.F32.F16 load0B0, load0B0.H0; + +08:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:4:-:1 F2F.F32.F16 load1B0, load1B0.H0; + +10:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:5:-:1 F2F.F32.F16 load2B0, load2B0.H0; + +20:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:6:-:1 F2F.F32.F16 load3B0, load3B0.H0; + } : q{ +04:-:-:-:1 F2F.F32.F16 load0B0, load0B0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:3:-:1 F2F.F32.F16 load0B3, load0B3; + +08:-:-:-:1 F2F.F32.F16 load1B0, load1B0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:4:-:1 F2F.F32.F16 load1B3, load1B3; + +10:-:-:-:1 F2F.F32.F16 load2B0, load2B0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:5:-:1 F2F.F32.F16 load2B3, load2B3; + +20:-:-:-:1 F2F.F32.F16 load3B0, load3B0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:6:-:1 F2F.F32.F16 load3B3, load3B3; + }; ++] + +--:-:-:-:0 LEA track0B0.CC, partialB, track0B0, 1; +04:-:-:-:6 STS.128 [writeBs + 4x<0*64>], load0B; +--:-:-:-:1 IADD.X track0B1, track0B1, RZ; + +--:-:-:-:0 LEA track1B0.CC, partialB, track1B0, 1; +08:-:-:-:6 STS.128 [writeBs + 4x<8*64>], load1B; +--:-:-:-:1 IADD.X track1B1, track1B1, RZ; + +--:-:-:-:0 LEA track2B0.CC, partialB, track2B0, 1; +10:-:-:-:6 STS.128 [writeBs + 4x<16*64>], load2B; +--:-:-:-:1 IADD.X track2B1, track2B1, RZ; + +--:-:-:-:0 LEA track3B0.CC, partialB, track3B0, 1; +20:-:-:-:6 STS.128 [writeBs + 4x<24*64>], load3B; +--:-:-:-:0 IADD.X 
track3B1, track3B1, RZ; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +[+ + our $vec; + return $vec ? q{ +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.64 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:5:-:1 @P3 LDG.E.CI.64 load2B, [track2B]; +--:-:6:-:1 @P3 LDG.E.CI.64 load3B, [track3B]; + } : q{ +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 
2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + }; ++] + +LOOP: + +[+ + our $vec; + our %insert = + ( + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, RZ, PT;\n", + j0c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, k, 32, PT;\n" . + "--:-:-:-:1 IADD k, k, -32;\n", + + j0c23 => "--:-:-:-:1 \@P1 R2P PR, preds, 0x7c;\n", + j0c24 => "--:-:-:-:1 \@!P1 R2P PR, RZ, 0x7c;\n", + + j2c32 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<32>;\n", + j2c37 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j3c32 => "--:-:-:-:1 \@P3 IADD track0B0.CC, track0B0, ldb32;\n", + j3c37 => "--:-:-:-:1 \@P3 IADD.X track0B1, track0B1, RZ;\n", + j4c32 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, ldb32;\n", + j4c37 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j5c32 => "--:-:-:-:1 \@P3 IADD track2B0.CC, track2B0, ldb32;\n", + j5c37 => "--:-:-:-:1 \@P3 IADD.X track2B1, track2B1, RZ;\n", + j6c32 => "--:-:-:-:1 \@P3 IADD track3B0.CC, track3B0, ldb32;\n", + j6c37 => "--:-:-:-:1 \@P3 IADD.X track3B1, track3B1, RZ;\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . 
+ "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j2c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<7*32>], load0A7;\n", + j2c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32>], load0A6;\n", + j2c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32>], load0A5;\n", + j2c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*32>], load0A4;\n", + j2c24 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*32>], load0A3;\n", + j2c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], load0A2;\n", + j2c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], load0A1;\n", + j2c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<0*32>], load0A0;\n", + + j3c16 => "04:3:-:-:1 \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n", + j4c16 => "08:4:-:-:1 \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n", + j5c16 => "10:5:-:-:1 \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n", + j6c16 => "20:6:-:-:1 \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n", + + ($vec ? + ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j2c51 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j3c51 => "08:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j4c51 => "10:-:-:-:1 \@P0 F2F.F32.F16 load2B3, 
load2B1.H1;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n", + + j5c51 => "20:-:-:-:1 \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j2c61 => "02:-:2:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j3c61 => "04:-:3:-:1 \@P3 LDG.E.CI.64 load0B, [track0B];\n", + j4c61 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1B, [track1B];\n", + j5c61 => "10:-:5:-:1 \@P3 LDG.E.CI.64 load2B, [track2B];\n", + j6c61 => "20:-:6:-:1 \@P3 LDG.E.CI.64 load3B, [track3B];\n", + ) : + ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A0, load0A0;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A1;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A2;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A3;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A4;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A5;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A6;\n", + # Last A convert sets barrier 2 in the third (write) control field, like every other F2F group; the STS at j2c16 waits on 02 for these results. + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A7, load0A7;\n", + + j2c51 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B1;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B2;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load0B3, load0B3;\n", + + j3c51 => "08:-:-:-:1 \@P0 F2F.F32.F16 load1B0, load1B0;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B1;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B2;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load1B3, load1B3;\n", + + j4c51 => "10:-:-:-:1 \@P0 F2F.F32.F16 load2B0, load2B0;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B1;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B2;\n", + j4c63 => 
"--:-:5:-:1 \@P0 F2F.F32.F16 load2B3, load2B3;\n", + + j5c51 => "20:-:-:-:1 \@P0 F2F.F32.F16 load3B0, load3B0;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B1;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B2;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load3B3, load3B3;\n", + + j2c48 => "02:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j2c50 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j2c52 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j2c54 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + j2c56 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n", + j2c58 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n", + j2c60 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n", + j2c62 => "--:-:2:-:1 \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n", + + j3c56 => "04:-:-:-:1 \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j3c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j3c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j3c62 => "--:-:3:-:1 \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + + j4c56 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j4c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j4c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j4c62 => "--:-:4:-:1 \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + + j5c56 => "10:-:-:-:1 \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j5c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j5c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j5c62 => "--:-:5:-:1 \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + + j6c56 => "20:-:-:-:1 \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j6c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j6c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j6c62 
=> "--:-:6:-:1 \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + ) + ), + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out = ''; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHR.U32 tid16, tid, 4; +--:-:-:-:1 SHL tid15, tid15, 2; +--:-:-:-:1 ISCADD readCs, tid16, tid15, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*64 + tid15; +--:-:-:-:1 ISCADD cx, blkB, tid15, 6; +--:-:-:-:1 IADD cx1, cx, 1; +--:-:-:-:1 IADD cx2, cx, 2; +--:-:-:-:1 IADD cx3, cx, 3; + +// cy = blkA*32 + tid16 +--:-:-:-:1 ISCADD cy, blkA, tid16, 5; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 SHL ldc8, ldc, 4; + +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C1, ci, param_C[1], RZ, 1; + +// P0 = cx < n +--:-:-:-:1 ISETP.LT.AND P0, PT, cx, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, cx1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, cx2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, cx3, param_n, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +// P4 = cy < m +--:-:-:-:1 ISETP.LT.AND P4, PT, cy, param_m, PT; + +// P5 = beta != 0 && P4 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P4; + +// P6 = Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; + +// Init beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, 
cx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:5 EXIT; + +STORE_C: + +[+ + our $vec; + return $vec ? 
q{ +--:-:1:-:1 @P0 LDG.E.64 loadC, [C]; + } : q{ +--:-:-:-:0 @!P0 MOV loadC0, RZ; +--:-:-:-:1 @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>]; +--:-:-:-:0 @!P1 MOV loadC1, RZ; +--:-:-:-:1 @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>]; +--:-:-:-:0 @!P2 MOV loadC2, RZ; +--:-:-:-:1 @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>]; +--:-:-:-:0 @!P3 MOV loadC3, RZ; +--:-:1:-:1 @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>]; + }; ++] + +// Restore output preds +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 LDS.U.128 part0C, [readCs + 4x< 0*64>]; +--:-:2:-:1 LDS.U.128 part1C, [readCs + 4x<16*64>]; +--:-:-:-:1 LDS.U.128 part2C, [readCs + 4x<32*64>]; +--:-:3:-:1 LDS.U.128 part3C, [readCs + 4x<48*64>]; + + +02:-:-:-:1 @P0 FADD part0C0, part0C0, part1C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part1C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part1C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part1C3; + +04:-:-:-:1 @P0 FADD part2C0, part2C0, part3C0; +--:-:-:-:1 @P1 FADD part2C1, part2C1, part3C1; +--:-:-:-:1 @P2 FADD part2C2, part2C2, part3C2; +--:-:-:-:1 @P3 FADD part2C3, part2C3, part3C3; + +--:-:-:-:1 @P0 FADD c0, part0C0, part2C0; +--:-:-:-:1 @P1 FADD c1, part0C1, part2C1; +--:-:-:-:1 @P2 FADD c2, part0C2, part2C2; +--:-:-:-:1 @P3 FADD c3, part0C3, part2C3; + + +--:-:-:-:0 IADD cy, cy, 8; + +[+ + our $vec; + return $vec ? 
q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0.H0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC0.H1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC1.H0; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC1.H1; + } : q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC2; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC3; + }; ++] + +01:-:-:-:1 @P5 FFMA c0, b0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, b1, beta, c1; +04:-:-:-:1 @P5 FFMA c2, b2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, b3, beta, c3; + +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:0 ISETP.LT.AND P5, PT, cy, param_m, P5; + +--:-:1:-:1 @P0 F2F.F16.F32 c0, c0; +--:-:2:-:1 @P1 F2F.F16.F32 c1, c1; + +--:-:-:-:0 ISETP.LT.AND P4, PT, cy, param_m, PT; + +--:-:3:-:1 @P2 F2F.F16.F32 c2, c2; + +--:-:-:-:0 LOP.XOR readCs, readCs, 4x<8*64>; + +--:-:4:-:1 @P3 F2F.F16.F32 c3, c3; + +[+ + our $vec; + return $vec ? q{ +03:-:-:-:2 @P0 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 @P0 BFI c1, c3, 0x1010, c2; + +--:1:-:-:1 @P0 STG.E.CG.64 [C], c; + } : q{ +01:-:-:-:1 @P0 STG.E.U16 [C + 2x<0>], c0; +02:-:-:-:1 @P1 STG.E.U16 [C + 2x<1>], c1; +04:-:-:-:1 @P2 STG.E.U16 [C + 2x<2>], c2; +08:1:-:-:1 @P3 STG.E.U16 [C + 2x<3>], c3; + }; ++] + +// Restore beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +01:-:-:-:6 IADD C0.CC, C0, ldc8; +--:-:-:-:0 IADD.X C1, C1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_nt_128x128.sass b/Kernel/SGEMM/Pascal/hgemm_nt_128x128.sass new file mode 100644 index 0000000..29a50f0 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nt_128x128.sass @@ -0,0 +1,400 @@ +# Kernel: hgemm_nt_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[- + +our $int16; + +our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + +sub convert_in {return $convert;} + + +sub int16_params { + return $int16 ? q{ +param_Stats[0] : c[0x0][0x190] +param_Stats[1] : c[0x0][0x194] +param_scale : c[0x0][0x198] + } : ""; +} +-] + + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + [+ int16_params() +] + + + + + 64-95 ~ tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid7, tid128, tid127, txa, txb, xmad_ta, xmad_tb, k1, k2, k3 + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-107 : loadA<0-5>, loadB<0-5> + + 108-111 : trackA<0-1>, trackB<0-1> + + 112-118 ~ writeS, k, tidY, ta, tb, loop + 119-127 ~ readAs, readBs, tid, 
blkA, blkB, blkZ + + 64-75 ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128 + + 64-79 : c<0-7>, d3, d2, d1, d0, cs<0-3> + 64-65 : Stats<0-1> + 80-89 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 90-118 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 MOV loop, RZ; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); ++] + +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// tidY = tid1 << 2 +--:-:-:-:1 SHL tidY, tid1, 2; + +// tidX = tid >> 1 +01:-:-:-:1 SHR.U32 tidX, tid, 1; + +// trackA += 2 * ((blkA*128 + tidX) * lda + tidY) +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x1; + +// trackB += 2 * ((blkB*128 + tidX) * ldb + tidY) +04:-:-:-:1 ISCADD txb, blkB, tidX, 7; +--:-:-:-:1 XMAD.LO tb, ldb, txb, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = 4 * (128 * tidY + tidX) +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeS, writeS, 4x<128*8*2>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 
4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + + +REMAINDER: + +[+ + our $vec; + return $vec ? q{ +// k must be multiple of 8 +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +--:-:2:-:1 @P5 LDG.E.CI.64 loadA0, [trackA + 2x<0>]; +--:-:2:-:1 @P5 LDG.E.CI.64 loadA4, [trackA + 2x<8>]; +--:-:4:-:1 @P6 LDG.E.CI.64 loadB0, [trackB + 2x<0>]; +--:5:6:-:1 @P6 LDG.E.CI.64 loadB4, [trackB + 2x<8>]; + +--:-:3:-:1 @!P5 LDS.U.64 loadA0, [addr_zero]; +--:-:3:-:1 @!P5 LDS.U.64 loadA4, [addr_zero]; +--:-:3:-:1 @!P6 LDS.U.64 loadB0, [addr_zero]; +--:-:3:-:2 @!P6 LDS.U.64 loadB4, [addr_zero]; + + // Vec 4 and scalar loads + } : q{ + +--:-:-:-:1 IADD k1, tidY, 1; +--:-:-:-:1 IADD k2, tidY, 2; +--:-:-:-:1 IADD k3, tidY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P6; + +--:-:4:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +// bDoRemainder = k & 7 && k 
> 8 +--:-:-:-:1 LOP.AND.NZ P4, RZ, k, 7; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 8, P4; + + }; ++] + +[+ + our $vec; + our $convert; + return $vec ? qq{ + +06:-:1:-:4 $convert loadA3, loadA1.H1; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<16>; +--:-:2:-:4 $convert loadA2, loadA1.H0; +--:-:-:-:4 $convert loadA1, loadA0.H1; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; +--:-:3:-:1 $convert loadA0, loadA0.H0; + +01:-:-:-:1 STS [writeS + 4x<3*128>], loadA3; +02:-:-:-:1 STS [writeS + 4x<2*128>], loadA2; +04:-:-:-:1 STS [writeS + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeS + 4x<0*128>], loadA0; + +08:-:1:-:4 $convert loadB3, loadB1.H1; +10:-:-:-:0 IADD trackB0.CC, trackB0, 2x<16>; +--:-:2:-:4 $convert loadB2, loadB1.H0; +--:-:3:-:4 $convert loadB1, loadB0.H1; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; +--:-:4:-:1 $convert loadB0, loadB0.H0; + +01:-:-:-:1 STS [writeS + 4x<11*128>], loadB3; +02:-:-:-:1 STS [writeS + 4x<10*128>], loadB2; +04:-:-:-:1 STS [writeS + 4x< 9*128>], loadB1; +08:-:-:-:1 STS [writeS + 4x< 8*128>], loadB0; + + // scalar loads + } : qq{ + +02:-:-:-:4 $convert loadA0, loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<8>; +--:-:2:-:4 $convert loadA1, loadA1; +--:-:-:-:4 $convert loadA2, loadA2; +--:-:3:-:1 $convert loadA3, loadA3; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS [writeS + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeS + 4x<1*128>], loadA1; +04:-:-:-:1 STS [writeS + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeS + 4x<3*128>], loadA3; + +08:-:-:-:4 $convert loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, 2x<8>; +--:-:2:-:4 $convert loadB1, loadB1; +--:-:-:-:4 $convert loadB2, loadB2; +--:-:3:-:1 $convert loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +02:-:-:-:1 STS [writeS + 4x< 8*128>], loadB0; +--:-:-:-:1 STS [writeS + 4x< 9*128>], loadB1; +04:-:-:-:1 STS [writeS + 4x<10*128>], loadB2; +--:-:-:-:1 STS [writeS + 4x<11*128>], loadB3; + }; ++] + + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, 
readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<128*8*2>; + + +[+ + our $vec; + our $convert; + our @top = $vec ? + ("--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n") : + ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, 16, P5;\n"); + our %insert = + ( + ($vec ? + ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + j0c13 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P5;\n", + j0c15 => "--:-:-:-:1 PSETP.AND.AND P3, PT, P0, P1, P6;\n", + + j0c27 => "--:-:2:-:1 \@P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];\n", + j0c29 => "--:-:2:-:1 \@P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];\n", + j0c31 => "--:-:4:-:1 \@P3 LDG.E.CI.64 loadB0, [trackB + 2x<0>];\n", + j0c33 => "20:5:6:-:1 \@P3 LDG.E.CI.64 loadB4, [trackB + 2x<8>];\n", + + j3c5 => "--:-:-:-:1 \@!P1 $convert loadA3, loadA5.H1;\n", + j3c9 => "--:-:-:-:1 \@!P1 $convert loadA2, loadA5.H0;\n", + j3c13 => "--:-:-:-:1 \@!P1 $convert loadA1, loadA4.H1;\n", + j3c17 => "--:-:-:-:1 \@!P1 $convert loadA0, loadA4.H0;\n", + + j4c5 => "--:-:-:-:1 \@!P1 $convert loadB3, loadB5.H1;\n", + j4c9 => "--:-:-:-:1 \@!P1 $convert loadB2, loadB5.H0;\n", + j4c13 => "--:-:-:-:1 \@!P1 $convert loadB1, loadB4.H1;\n", + j4c17 => "--:-:-:-:1 \@!P1 $convert loadB0, loadB4.H0;\n", + + j5c5 => "02:-:-:-:1 \@P1 $convert loadA3, loadA1.H1;\n", + j5c9 => "--:-:2:-:1 \@P1 $convert loadA2, loadA1.H0;\n", + j5c13 => "--:-:-:-:1 \@P1 $convert loadA1, loadA0.H1;\n", + j5c17 => "--:-:3:-:1 \@P1 $convert loadA0, loadA0.H0;\n", + + j5c29 => "02:-:-:-:1 \@P0 STS [writeS + 4x<3*128>], loadA3;\n", + j5c31 => "--:-:-:-:1 \@P0 STS [writeS + 4x<2*128>], loadA2;\n", + j5c33 => "04:-:-:-:1 \@P0 STS [writeS + 4x<1*128>], loadA1;\n", + j5c35 => "--:-:-:-:1 \@P0 STS [writeS + 4x<0*128>], loadA0;\n", + + j6c5 => "08:-:-:-:1 \@P1 $convert loadB3, loadB1.H1;\n", + j6c9 => "--:-:2:-:1 \@P1 $convert loadB2, loadB1.H0;\n", + j6c13 => "--:-:3:-:1 \@P1 $convert loadB1, loadB0.H1;\n", + j6c17 => "--:-:4:-:1 \@P1 $convert loadB0, loadB0.H0;\n", + + j6c29 
=> "02:-:-:-:1 \@P0 STS [writeS + 4x<11*128>], loadB3;\n", + j6c31 => "--:-:-:-:1 \@P0 STS [writeS + 4x<10*128>], loadB2;\n", + j6c33 => "04:-:-:-:1 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n", + j6c35 => "08:-:-:-:1 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 2x<16>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c46 => "10:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, 2x<16>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : + ( + j0c7 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 16, P6;\n", + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n", + j0c12 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n", + j0c14 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n", + j0c16 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n", + + j0c29 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j0c31 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j0c33 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j0c35 => "--:-:6:-:1 \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j5c5 => "02:-:2:-:1 \@P2 $convert loadA0, loadA0;\n", + j5c9 => "--:-:3:-:1 \@P2 $convert loadA1, loadA1;\n", + j5c13 => "--:-:4:-:1 \@P2 $convert loadA2, loadA2;\n", + j5c17 => "--:-:5:-:1 \@P2 $convert loadA3, loadA3;\n", + + j5c29 => "02:-:-:-:1 \@P0 STS [writeS + 4x<0*128>], loadA0;\n", + j5c31 => "04:-:-:-:1 \@P0 STS [writeS + 4x<1*128>], loadA1;\n", + j5c33 => "08:-:-:-:1 \@P0 STS [writeS + 4x<2*128>], loadA2;\n", + j5c35 => "10:-:-:-:1 \@P0 STS [writeS + 4x<3*128>], loadA3;\n", + + j6c5 => "20:-:2:-:1 \@P3 $convert loadB0, loadB0;\n", + j6c9 => "--:-:3:-:1 \@P3 $convert loadB1, loadB1;\n", + j6c13 => "--:-:4:-:1 \@P3 $convert loadB2, loadB2;\n", + j6c17 => "--:-:5:-:1 \@P3 $convert loadB3, loadB3;\n", + + j6c29 => 
"02:-:-:-:1 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n", + j6c31 => "04:-:-:-:1 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n", + j6c33 => "08:-:-:-:1 \@P0 STS [writeS + 4x<10*128>], loadB2;\n", + j6c35 => "10:-:-:-:1 \@P0 STS [writeS + 4x<11*128>], loadB3;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 2x<8>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c46 => "10:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, 2x<8>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ) + ), + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" . + "--:-:-:-:1 IADD32I k, k, -8;\n", + ); + return; ++] + + diff --git a/Kernel/SGEMM/Pascal/hgemm_nt_16x64.sass b/Kernel/SGEMM/Pascal/hgemm_nt_16x64.sass new file mode 100644 index 0000000..ce5e6ef --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nt_16x64.sass @@ -0,0 +1,1185 @@ +# Kernel: hgemm_nt_16x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<(16*64 + 32)*2 + (64*64 + 32)*2> + szShareA : (16*64 + 32) + szShareB : (64*64 + 32) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 0-63 : czero<00-63> + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 64-95 ~ tidX, tidY, tidY<1-3>, lda, ldb, ldaz, ldbz, ldb16, tid16_8, ta, txa, tb<00|16|32|48>, txb<00|16|32|48>, xmad_ta, xmad_tb, shiftX, predsY0, predsY4, partialK + + 96-135 : load0A<0-7>, load0B<0-7>, load1B<0-7>, load2B<0-7>, load3B<0-7> + 136-145 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 146-152 ~ swapBuf, readAs, readBs, writeAs, writeBs, k + 153-159 ~ tid, blkA, blkB, blkZ, writeCs, preds, tid16 + + 0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3> + 64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3 + 64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7 + 96-99 : loadC<0-3> + 100-103 : b<0-3> + 104-107 : c<0-3> + 108-109 : C<0-1> + 110-152 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15 + + + 
+// Kernel prologue: read thread/block ids, copy scalar params into registers, zero the 64 accumulator registers, then build the global load addresses (track*) and their bounds predicates. +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb16, ldb, 4; + +// Store zeros at addr_zero, then read them back 128 bits at a time into czero00, czero04, ... czero60 (16 loads cover czero00-63). +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +] + +// tidX = tid >> 3 +// tidY = (tid & 7) << 3 +// shiftX = (tid & 7) << 2 +01:-:-:-:1 SHR.U32 tidX, tid, 3; +--:-:-:-:1 LOP.AND tidY, tid, 7; +--:-:-:-:1 SHL shiftX, tidY, 2; +--:-:-:-:1 SHL tidY, tidY, 3; + +// txa = blkA*16 + tidX +// track0A = param_A + ((txa * lda + tidY) + blkZ * ldaz) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidX, 4; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 1; + +// P2 = (txa < param_m) +--:-:-:-:1 ISETP.LT.AND P2, PT, txa, param_m, PT; + +// txb00 = blkB*64 + tidX, txb16/32/48 = txb00 + 16/32/48 +// track0B = param_B + ((txb00 * ldb + tidY) + blkZ * ldbz) * 2 +// track1B/2B/3B use the same offset advanced by 16/32/48 rows (ldb16 = ldb * 16 each step) +04:-:-:-:1 ISCADD txb00, blkB, tidX, 6; +--:-:-:-:1 IADD txb16, txb00, 16; +--:-:-:-:1 IADD txb32, txb00, 32; +--:-:-:-:1 IADD txb48, txb00, 48; +--:-:-:-:1 XMAD.LO tb00, ldb, txb00, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb00, ldbz, blkZ, tb00; +--:-:-:-:1 IADD tb16, tb00, ldb16; +--:-:-:-:1 IADD tb32, tb16, ldb16; +--:-:-:-:1 IADD tb48, tb32, ldb16; +--:-:-:-:1 LEA track0B0.CC, tb00, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb00, param_B[1], RZ, 1; +--:-:-:-:1 LEA track1B0.CC, tb16, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb16, param_B[1], RZ, 1; +--:-:-:-:1 LEA track2B0.CC, tb32, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb32, param_B[1], RZ, 1; +--:-:-:-:1 LEA track3B0.CC, tb48, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb48, param_B[1], RZ, 1; + + +// P3/P4/P5 = (txb00/txb16/txb32 < param_n) +--:-:-:-:1 ISETP.LT.AND P3, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb16, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb32, param_n, PT;
+--:-:-:-:1 ISETP.LT.AND P6, PT, txb48, param_n, PT; + +// Save the bounds predicates P2-P6 (mask 0x7c) in preds so later code can restore or clear them with R2P. +--:-:-:-:1 P2R preds, PR, RZ, 0x7c; + +// writeAs = (tidY*16 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 4; +--:-:-:-:1 IADD writeAs, writeAs, shiftX; +--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (tidY*64 + tidX + shiftX) * 4 + 4x<szShareA> (the B tile is placed after the A tile in shared memory) +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 6; +--:-:-:-:1 IADD writeBs, writeBs, shiftX; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<szShareA>, 2; + +// readAs = (tid & 1) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid >> 1) & 7) << 4 +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHL readBs, readBs, 4; + +// tid16 = tid & -16 +// tid16_8 = tid16 / 2 * 4 +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHL tid16_8, tid16, 1; + +// writeCs = (readAs + tid16*2) * 64 + readBs; +--:-:-:-:1 ISCADD writeCs, tid16, readAs, 1; +--:-:-:-:1 ISCADD writeCs, writeCs, readBs, 6; + +// Each block of 16 threads works on 8 lines, shifted over by 4 +// readAs += tid16_8 * 16 + tid16 +// readBs += tid16_8 * 64 + tid16 + 4x<szShareA> +--:-:-:-:1 ISCADD readAs, tid16_8, readAs, 4; +--:-:-:-:1 ISCADD readBs, tid16_8, readBs, 6; +--:-:-:-:1 IADD readAs, tid16, readAs; +--:-:-:-:1 IADD3 readBs, tid16, 4x<szShareA>, readBs; + +// swapBuf = byte size of one A+B shared buffer; the main loop adds/subtracts it (and negates it) to alternate between the two buffers. +--:-:-:-:1 MOV32I swapBuf, 4x<szShareA + szShareB>; + +// If k is not a multiple of 64 we want to grab the partial amount on the first fetch. +// If it is a multiple of 64 then make a full 64 line fetch. +--:-:-:-:1 LOP.AND.Z P0, partialK, k, 63; +--:-:-:-:1 @P0 MOV partialK, 64; +--:-:-:-:1 IADD k, k, -partialK; +[+ + our $vec; + return $vec ?
q{ + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x7c; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c; + + +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.128 load0B, [track0B]; +--:-:4:-:1 @P4 LDG.E.CI.128 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load2B, [track2B]; +--:-:6:-:1 @P6 LDG.E.CI.128 load3B, [track3B]; + + + +--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.128 load0B, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 load1B, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.128 load2B, [addr_zero]; +--:-:1:-:1 @!P6 LDS.U.128 load3B, [addr_zero]; + + + } : q{ +--:-:-:-:1 IADD tidY1, tidY, 1; +--:-:-:-:1 IADD tidY2, tidY, 2; +--:-:-:-:1 IADD tidY3, tidY, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, partialK, PT; +--:-:-:-:1 P2R predsY0, PR, RZ, 0x0f; + +--:-:-:-:1 IADD tidY, tidY, 4; +--:-:-:-:1 IADD tidY1, tidY1, 4; +--:-:-:-:1 IADD tidY2, tidY2, 4; +--:-:-:-:1 IADD tidY3, tidY3, 4; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, partialK, PT; +--:-:-:-:1 P2R predsY4, PR, RZ, 0x0f; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa, param_m, PT; +--:-:-:-:1 @P4 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 @P4 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0A4, [track0A + 
2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load0A4, RZ; +--:-:-:-:1 @!P1 MOV load0A5, RZ; +--:-:-:-:1 @!P2 MOV load0A6, RZ; +--:-:-:-:1 @!P3 MOV load0A7, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb00, param_n, PT; +--:-:-:-:1 @P5 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 @P5 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0B4, [track0B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0B5, [track0B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0B6, [track0B + 2x<6>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load0B4, RZ; +--:-:-:-:1 @!P1 MOV load0B5, RZ; +--:-:-:-:1 @!P2 MOV load0B6, RZ; +--:-:-:-:1 @!P3 MOV load0B7, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb16, param_n, PT; +--:-:-:-:1 @P6 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 @P6 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1B4, [track1B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1B5, [track1B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1B6, [track1B + 
2x<6>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B7, [track1B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load1B4, RZ; +--:-:-:-:1 @!P1 MOV load1B5, RZ; +--:-:-:-:1 @!P2 MOV load1B6, RZ; +--:-:-:-:1 @!P3 MOV load1B7, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb32, param_n, PT; +--:-:-:-:1 @P4 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 @P4 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load2B4, [track2B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load2B5, [track2B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load2B6, [track2B + 2x<6>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 load2B7, [track2B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load2B4, RZ; +--:-:-:-:1 @!P1 MOV load2B5, RZ; +--:-:-:-:1 @!P2 MOV load2B6, RZ; +--:-:-:-:1 @!P3 MOV load2B7, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb48, param_n, PT; +--:-:-:-:1 @P6 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:6:-:1 @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + +--:-:-:-:1 @P6 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load3B4, [track3B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load3B5, [track3B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load3B6, [track3B + 2x<6>]; +--:-:6:-:1 @P3 LDG.E.CI.U16 load3B7, [track3B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load3B4, RZ; +--:-:-:-:1 @!P1 
MOV load3B5, RZ; +--:-:-:-:1 @!P2 MOV load3B6, RZ; +--:-:-:-:1 @!P3 MOV load3B7, RZ; + }; ++] +--:-:-:-:1 SHL partialK, partialK, 1; + +--:-:-:-:1 ISETP.GE.AND P0, PT, k, 64, PT; +--:-:-:-:1 IADD k, k, -64; +--:-:-:-:1 @P0 R2P PR, preds, 0x7c; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x7c; + + +[+ + our $vec; + return $vec ? q{ +03:-:-:-:1 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A2.H1; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0.H0; + } : q{ +02:-:-:-:1 F2F.F32.F16 load0A7, load0A7; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A6; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A5; +--:-:1:-:1 F2F.F32.F16 load0A4, load0A4; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A3; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0; + }; ++] +--:-:-:-:0 IADD track0A0.CC, track0A0, partialK; +01:-:-:-:1 STS [writeAs + 4x<7*16>], load0A7; +--:-:-:-:1 STS [writeAs + 4x<6*16>], load0A6; +--:-:-:-:1 STS [writeAs + 4x<5*16>], load0A5; +--:-:-:-:1 STS [writeAs + 4x<4*16>], load0A4; +02:-:-:-:1 STS [writeAs + 4x<3*16>], load0A3; +--:-:-:-:1 STS [writeAs + 4x<2*16>], load0A2; +--:-:-:-:1 STS [writeAs + 4x<1*16>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<0*16>], load0A0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +04:-:-:-:1 F2F.F32.F16 load0B7, load0B3.H1; +--:-:-:-:1 F2F.F32.F16 load0B6, load0B3.H0; +--:-:-:-:1 F2F.F32.F16 load0B5, load0B2.H1; +--:-:1:-:1 F2F.F32.F16 load0B4, load0B2.H0; +--:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:2:-:1 F2F.F32.F16 load0B0, load0B0.H0; + } : q{ +04:-:-:-:1 F2F.F32.F16 load0B7, load0B7; +--:-:-:-:1 F2F.F32.F16 load0B6, load0B6; +--:-:-:-:1 F2F.F32.F16 load0B5, load0B5; +--:-:1:-:1 F2F.F32.F16 load0B4, load0B4; +--:-:-:-:1 F2F.F32.F16 load0B3, load0B3; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:2:-:1 F2F.F32.F16 load0B0, load0B0; + }; ++] +--:-:-:-:0 IADD track0B0.CC, track0B0, partialK; +01:-:-:-:1 STS [writeBs + 4x<7*64 + 0*16>], load0B7; +--:-:-:-:1 STS [writeBs + 4x<6*64 + 0*16>], load0B6; +--:-:-:-:1 STS [writeBs + 4x<5*64 + 0*16>], load0B5; +--:-:-:-:1 STS [writeBs + 4x<4*64 + 0*16>], load0B4; +02:-:-:-:1 STS [writeBs + 4x<3*64 + 0*16>], load0B3; +--:-:-:-:1 STS [writeBs + 4x<2*64 + 0*16>], load0B2; +--:-:-:-:1 STS [writeBs + 4x<1*64 + 0*16>], load0B1; +--:-:-:-:1 STS [writeBs + 4x<0*64 + 0*16>], load0B0; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +08:-:-:-:1 F2F.F32.F16 load1B7, load1B3.H1; +--:-:-:-:1 F2F.F32.F16 load1B6, load1B3.H0; +--:-:-:-:1 F2F.F32.F16 load1B5, load1B2.H1; +--:-:1:-:1 F2F.F32.F16 load1B4, load1B2.H0; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + } : q{ +08:-:-:-:1 F2F.F32.F16 load1B7, load1B7; +--:-:-:-:1 F2F.F32.F16 load1B6, load1B6; +--:-:-:-:1 F2F.F32.F16 load1B5, load1B5; +--:-:1:-:1 F2F.F32.F16 load1B4, load1B4; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B3; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0; + }; ++] +--:-:-:-:0 IADD track1B0.CC, track1B0, partialK; +01:-:-:-:1 STS [writeBs + 4x<7*64 + 1*16>], load1B7; +--:-:-:-:1 STS [writeBs + 4x<6*64 + 1*16>], load1B6; +--:-:-:-:1 STS [writeBs + 4x<5*64 + 1*16>], load1B5; +--:-:-:-:1 STS [writeBs + 4x<4*64 + 1*16>], load1B4; +02:-:-:-:1 STS [writeBs + 4x<3*64 + 1*16>], load1B3; +--:-:-:-:1 STS [writeBs + 4x<2*64 + 1*16>], load1B2; +--:-:-:-:1 STS [writeBs + 4x<1*64 + 1*16>], load1B1; +--:-:-:-:1 STS [writeBs + 4x<0*64 + 1*16>], load1B0; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +10:-:-:-:1 F2F.F32.F16 load2B7, load2B3.H1; +--:-:-:-:1 F2F.F32.F16 load2B6, load2B3.H0; +--:-:-:-:1 F2F.F32.F16 load2B5, load2B2.H1; +--:-:1:-:1 F2F.F32.F16 load2B4, load2B2.H0; +--:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:2:-:1 F2F.F32.F16 load2B0, load2B0.H0; + } : q{ +10:-:-:-:1 F2F.F32.F16 load2B7, load2B7; +--:-:-:-:1 F2F.F32.F16 load2B6, load2B6; +--:-:-:-:1 F2F.F32.F16 load2B5, load2B5; +--:-:1:-:1 F2F.F32.F16 load2B4, load2B4; +--:-:-:-:1 F2F.F32.F16 load2B3, load2B3; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:2:-:1 F2F.F32.F16 load2B0, load2B0; + }; ++] +--:-:-:-:0 IADD track2B0.CC, track2B0, partialK; +01:-:-:-:1 STS [writeBs + 4x<7*64 + 2*16>], load2B7; +--:-:-:-:1 STS [writeBs + 4x<6*64 + 2*16>], load2B6; +--:-:-:-:1 STS [writeBs + 4x<5*64 + 2*16>], load2B5; +--:-:-:-:1 STS [writeBs + 4x<4*64 + 2*16>], load2B4; +02:-:-:-:1 STS [writeBs + 4x<3*64 + 2*16>], load2B3; +--:-:-:-:1 STS [writeBs + 4x<2*64 + 2*16>], load2B2; +--:-:-:-:1 STS [writeBs + 4x<1*64 + 2*16>], load2B1; +--:-:-:-:1 STS [writeBs + 4x<0*64 + 2*16>], load2B0; +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +20:-:-:-:1 F2F.F32.F16 load3B7, load3B3.H1; +--:-:-:-:1 F2F.F32.F16 load3B6, load3B3.H0; +--:-:-:-:1 F2F.F32.F16 load3B5, load3B2.H1; +--:-:1:-:1 F2F.F32.F16 load3B4, load3B2.H0; +--:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:2:-:1 F2F.F32.F16 load3B0, load3B0.H0; + } : q{ +20:-:-:-:1 F2F.F32.F16 load3B7, load3B7; +--:-:-:-:1 F2F.F32.F16 load3B6, load3B6; +--:-:-:-:1 F2F.F32.F16 load3B5, load3B5; +--:-:1:-:1 F2F.F32.F16 load3B4, load3B4; +--:-:-:-:1 F2F.F32.F16 load3B3, load3B3; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:2:-:1 F2F.F32.F16 load3B0, load3B0; + }; ++] +--:-:-:-:0 IADD track3B0.CC, track3B0, partialK; +01:-:-:-:1 STS [writeBs + 4x<7*64 + 3*16>], load3B7; +--:-:-:-:1 STS [writeBs + 4x<6*64 + 3*16>], load3B6; +--:-:-:-:1 STS [writeBs + 4x<5*64 + 3*16>], load3B5; +--:-:-:-:1 STS [writeBs + 4x<4*64 + 3*16>], load3B4; +02:-:-:-:1 STS [writeBs + 4x<3*64 + 3*16>], load3B3; +--:-:-:-:1 STS [writeBs + 4x<2*64 + 3*16>], load3B2; +--:-:-:-:1 STS [writeBs + 4x<1*64 + 3*16>], load3B1; +--:-:-:-:1 STS [writeBs + 4x<0*64 + 3*16>], load3B0; +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*16 + 00>]; +--:-:-:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*16 + 08>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +[+ + our $vec; + return $vec ? 
q{ +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.128 load0B, [track0B]; +--:-:4:-:1 @P4 LDG.E.CI.128 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load2B, [track2B]; +--:-:6:-:1 @P6 LDG.E.CI.128 load3B, [track3B]; + } : q{ +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B4, [track0B + 2x<4>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B5, [track0B + 2x<5>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load0B6, [track0B + 2x<6>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>]; + +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B4, [track1B + 2x<4>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B5, [track1B + 2x<5>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load1B6, [track1B + 2x<6>]; +--:-:4:-:1 @P4 LDG.E.CI.U16 load1B7, [track1B + 2x<7>]; + +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B4, [track2B + 2x<4>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load2B5, [track2B + 2x<5>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 
load2B6, [track2B + 2x<6>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2B7, [track2B + 2x<7>]; + +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B4, [track3B + 2x<4>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B5, [track3B + 2x<5>]; +--:-:-:-:1 @P6 LDG.E.CI.U16 load3B6, [track3B + 2x<6>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 load3B7, [track3B + 2x<7>]; + }; ++] + +LOOP: + +[+ + our $vec; + our %insert = + ( + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, RZ, PT;\n", + j0c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, k, 64, PT;\n" . + "--:-:-:-:1 IADD k, k, -64;\n", + + j0c23 => "--:-:-:-:1 \@P1 R2P PR, preds, 0x7c;\n", + j0c24 => "--:-:-:-:1 \@!P1 R2P PR, RZ, 0x7c;\n", + + j2c32 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<64>;\n", + j2c37 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j3c32 => "--:-:-:-:1 \@P3 IADD track0B0.CC, track0B0, 2x<64>;\n", + j3c37 => "--:-:-:-:1 \@P3 IADD.X track0B1, track0B1, RZ;\n", + j4c32 => "--:-:-:-:1 \@P4 IADD track1B0.CC, track1B0, 2x<64>;\n", + j4c37 => "--:-:-:-:1 \@P4 IADD.X track1B1, track1B1, RZ;\n", + j5c32 => "--:-:-:-:1 \@P5 IADD track2B0.CC, track2B0, 2x<64>;\n", + j5c37 => "--:-:-:-:1 \@P5 IADD.X track2B1, track2B1, RZ;\n", + j6c32 => "--:-:-:-:1 \@P6 IADD track3B0.CC, track3B0, 2x<64>;\n", + j6c37 => "--:-:-:-:1 \@P6 IADD.X track3B1, track3B1, RZ;\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n", + j1c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j2c36 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B7, load0B3.H1;\n", + j2c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B6, load0B3.H0;\n", + j2c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B5, load0B2.H1;\n", + j2c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B4, load0B2.H0;\n", + j2c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j3c36 => "08:-:-:-:1 \@P0 F2F.F32.F16 load1B7, load1B3.H1;\n", + j3c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B6, load1B3.H0;\n", + j3c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B5, load1B2.H1;\n", + j3c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B4, load1B2.H0;\n", + j3c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j4c36 => "10:-:-:-:1 \@P0 F2F.F32.F16 load2B7, load2B3.H1;\n", + j4c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B6, load2B3.H0;\n", + j4c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B5, load2B2.H1;\n", + j4c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B4, load2B2.H0;\n", + j4c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 
load2B1, load2B0.H1;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n", + + j5c36 => "20:-:-:-:1 \@P0 F2F.F32.F16 load3B7, load3B3.H1;\n", + j5c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B6, load3B3.H0;\n", + j5c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B5, load3B2.H1;\n", + j5c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B4, load3B2.H0;\n", + j5c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j2c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<7*16>], load0A7;\n", + j2c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*16>], load0A6;\n", + j2c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*16>], load0A5;\n", + j2c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*16>], load0A4;\n", + j2c24 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*16>], load0A3;\n", + j2c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*16>], load0A2;\n", + j2c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*16>], load0A1;\n", + j2c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<0*16>], load0A0;\n", + + j3c16 => "04:-:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 0*16>], load0B7;\n", + j3c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 0*16>], load0B6;\n", + j3c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 0*16>], load0B5;\n", + j3c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 0*16>], load0B4;\n", + j3c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 0*16>], load0B3;\n", + j3c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 0*16>], load0B2;\n", + j3c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 0*16>], load0B1;\n", + j3c30 => "--:3:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 0*16>], load0B0;\n", + + j4c16 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 1*16>], load1B7;\n", + j4c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 1*16>], load1B6;\n", + j4c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 1*16>], load1B5;\n", + j4c22 => "--:-:-:-:1 \@P0 STS 
[writeBs + 4x<4*64 + 1*16>], load1B4;\n", + j4c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 1*16>], load1B3;\n", + j4c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 1*16>], load1B2;\n", + j4c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 1*16>], load1B1;\n", + j4c30 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 1*16>], load1B0;\n", + + j5c16 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 2*16>], load2B7;\n", + j5c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 2*16>], load2B6;\n", + j5c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 2*16>], load2B5;\n", + j5c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 2*16>], load2B4;\n", + j5c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 2*16>], load2B3;\n", + j5c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 2*16>], load2B2;\n", + j5c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 2*16>], load2B1;\n", + j5c30 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 2*16>], load2B0;\n", + + j6c16 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 3*16>], load3B7;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 3*16>], load3B6;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 3*16>], load3B5;\n", + j6c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 3*16>], load3B4;\n", + j6c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 3*16>], load3B3;\n", + j6c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 3*16>], load3B2;\n", + j6c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 3*16>], load3B1;\n", + j6c30 => "--:6:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 3*16>], load3B0;\n", + + j2c61 => "02:-:2:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j3c61 => "04:-:3:-:1 \@P3 LDG.E.CI.128 load0B, [track0B];\n", + j4c61 => "08:-:4:-:1 \@P4 LDG.E.CI.128 load1B, [track1B];\n", + j5c61 => "10:-:5:-:1 \@P5 LDG.E.CI.128 load2B, [track2B];\n", + j6c61 => "20:-:6:-:1 \@P6 LDG.E.CI.128 load3B, [track3B];\n", + ) : + ( + j1c35 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A0, load0A0;\n", + j1c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A1;\n", + j1c43 => 
"--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A2;\n", + j1c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A3;\n", + j1c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A4;\n", + j1c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A5;\n", + j1c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A6;\n", + j1c63 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A7, load0A7;\n", + + j2c36 => "04:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0;\n", + j2c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B1;\n", + j2c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B2;\n", + j2c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B3;\n", + j2c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B4, load0B4;\n", + j2c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B5, load0B5;\n", + j2c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B6, load0B6;\n", + j2c63 => "--:-:3:-:1 \@P0 F2F.F32.F16 load0B7, load0B7;\n", + + j3c36 => "08:-:-:-:1 \@P0 F2F.F32.F16 load1B0, load1B0;\n", + j3c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B1;\n", + j3c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B2;\n", + j3c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B3;\n", + j3c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B4, load1B4;\n", + j3c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B5, load1B5;\n", + j3c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B6, load1B6;\n", + j3c63 => "--:-:4:-:1 \@P0 F2F.F32.F16 load1B7, load1B7;\n", + + j4c36 => "10:-:-:-:1 \@P0 F2F.F32.F16 load2B0, load2B0;\n", + j4c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B1, load2B1;\n", + j4c43 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B2, load2B2;\n", + j4c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B3, load2B3;\n", + j4c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B4, load2B4;\n", + j4c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B5, load2B5;\n", + j4c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load2B6, load2B6;\n", + j4c63 => "--:-:5:-:1 \@P0 F2F.F32.F16 load2B7, load2B7;\n", + + j5c36 => "20:-:-:-:1 \@P0 F2F.F32.F16 load3B0, load3B0;\n", + j5c39 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B1, load3B1;\n", + j5c43 => 
"--:-:-:-:1 \@P0 F2F.F32.F16 load3B2, load3B2;\n", + j5c47 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B3, load3B3;\n", + j5c51 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B4, load3B4;\n", + j5c55 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B5, load3B5;\n", + j5c59 => "--:-:-:-:1 \@P0 F2F.F32.F16 load3B6, load3B6;\n", + j5c63 => "--:-:6:-:1 \@P0 F2F.F32.F16 load3B7, load3B7;\n", + + j2c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<0*16>], load0A0;\n", + j2c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*16>], load0A1;\n", + j2c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*16>], load0A2;\n", + j2c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*16>], load0A3;\n", + j2c24 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*16>], load0A4;\n", + j2c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*16>], load0A5;\n", + j2c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*16>], load0A6;\n", + j2c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<7*16>], load0A7;\n", + + j3c16 => "04:-:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 0*16>], load0B0;\n", + j3c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 0*16>], load0B1;\n", + j3c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 0*16>], load0B2;\n", + j3c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 0*16>], load0B3;\n", + j3c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 0*16>], load0B4;\n", + j3c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 0*16>], load0B5;\n", + j3c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 0*16>], load0B6;\n", + j3c30 => "--:3:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 0*16>], load0B7;\n", + + j4c16 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 1*16>], load1B0;\n", + j4c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 1*16>], load1B1;\n", + j4c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 1*16>], load1B2;\n", + j4c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 1*16>], load1B3;\n", + j4c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 1*16>], load1B4;\n", + j4c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 1*16>], load1B5;\n", + j4c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 1*16>], 
load1B6;\n", + j4c30 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 1*16>], load1B7;\n", + + j5c16 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 2*16>], load2B0;\n", + j5c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 2*16>], load2B1;\n", + j5c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 2*16>], load2B2;\n", + j5c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 2*16>], load2B3;\n", + j5c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 2*16>], load2B4;\n", + j5c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 2*16>], load2B5;\n", + j5c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 2*16>], load2B6;\n", + j5c30 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 2*16>], load2B7;\n", + + j6c16 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<0*64 + 3*16>], load3B0;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*64 + 3*16>], load3B1;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*64 + 3*16>], load3B2;\n", + j6c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*64 + 3*16>], load3B3;\n", + j6c24 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*64 + 3*16>], load3B4;\n", + j6c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*64 + 3*16>], load3B5;\n", + j6c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*64 + 3*16>], load3B6;\n", + j6c30 => "--:6:-:-:1 \@P0 STS [writeBs + 4x<7*64 + 3*16>], load3B7;\n", + + j2c48 => "02:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j2c50 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j2c52 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j2c54 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + j2c56 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n", + j2c58 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n", + j2c60 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n", + j2c62 => "--:-:2:-:1 \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n", + + j3c48 => "04:-:-:-:1 \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j3c50 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B1, [track0B + 
2x<1>];\n", + j3c52 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j3c54 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + j3c56 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];\n", + j3c58 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];\n", + j3c60 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];\n", + j3c62 => "--:-:3:-:1 \@P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];\n", + + j4c48 => "08:-:-:-:1 \@P4 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j4c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j4c52 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j4c54 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + j4c56 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];\n", + j4c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];\n", + j4c60 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];\n", + j4c62 => "--:-:4:-:1 \@P4 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];\n", + + j5c48 => "10:-:-:-:1 \@P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j5c50 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j5c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j5c54 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + j5c56 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B4, [track2B + 2x<4>];\n", + j5c58 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B5, [track2B + 2x<5>];\n", + j5c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2B6, [track2B + 2x<6>];\n", + j5c62 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load2B7, [track2B + 2x<7>];\n", + + j6c48 => "20:-:-:-:1 \@P6 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j6c50 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j6c52 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j6c54 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + j6c56 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B4, [track3B + 2x<4>];\n", + j6c58 => 
"--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B5, [track3B + 2x<5>];\n", + j6c60 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 load3B6, [track3B + 2x<6>];\n", + j6c62 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 load3B7, [track3B + 2x<7>];\n", + ) + ), + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out = ''; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*16 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*16 + 08>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? 
'01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHR.U32 tid16, tid, 4; +--:-:-:-:1 SHL tid15, tid15, 2; +--:-:-:-:1 ISCADD readCs, tid16, tid15, 6; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*64 + tid15; +--:-:-:-:1 ISCADD cx, blkB, tid15, 6; +--:-:-:-:1 IADD cx1, cx, 1; +--:-:-:-:1 IADD cx2, cx, 2; +--:-:-:-:1 IADD cx3, cx, 3; + +// cy = blkA*16 + tid16 +--:-:-:-:1 ISCADD cy, blkA, tid16, 4; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 SHL ldc8, ldc, 4; + +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C1, ci, param_C[1], RZ, 1; + +// P0 = cx < n +--:-:-:-:1 ISETP.LT.AND P0, PT, cx, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, cx1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, cx2, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, cx3, param_n, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +// P4 = cy < m +--:-:-:-:1 ISETP.LT.AND P4, PT, cy, param_m, PT; + +// P5 = beta != 0 && P4 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P4; + +// P6 = Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; + +// Init beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2; +--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, cx7y4, alpha; 
+--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6; +--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:5 EXIT; + +STORE_C: + +[+ + our $vec; + return $vec ? 
q{ +--:-:1:-:1 @P0 LDG.E.64 loadC, [C]; + } : q{ +--:-:-:-:0 @!P0 MOV loadC0, RZ; +--:-:-:-:1 @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>]; +--:-:-:-:0 @!P1 MOV loadC1, RZ; +--:-:-:-:1 @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>]; +--:-:-:-:0 @!P2 MOV loadC2, RZ; +--:-:-:-:1 @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>]; +--:-:-:-:0 @!P3 MOV loadC3, RZ; +--:-:1:-:1 @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>]; + }; ++] + +// Restore output preds +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 LDS.U.128 part0C, [readCs + 4x<0*8*64>]; +--:-:2:-:1 LDS.U.128 part1C, [readCs + 4x<1*8*64>]; +--:-:-:-:1 LDS.U.128 part2C, [readCs + 4x<2*8*64>]; +--:-:3:-:1 LDS.U.128 part3C, [readCs + 4x<3*8*64>]; +--:-:-:-:1 LDS.U.128 part4C, [readCs + 4x<4*8*64>]; +--:-:4:-:1 LDS.U.128 part5C, [readCs + 4x<5*8*64>]; +--:-:-:-:1 LDS.U.128 part6C, [readCs + 4x<6*8*64>]; +--:-:5:-:1 LDS.U.128 part7C, [readCs + 4x<7*8*64>]; + + +02:-:-:-:1 @P0 FADD part0C0, part0C0, part1C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part1C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part1C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part1C3; + +04:-:-:-:1 @P0 FADD part2C0, part2C0, part3C0; +--:-:-:-:1 @P1 FADD part2C1, part2C1, part3C1; +--:-:-:-:1 @P2 FADD part2C2, part2C2, part3C2; +--:-:-:-:1 @P3 FADD part2C3, part2C3, part3C3; + +08:-:-:-:1 @P0 FADD part4C0, part4C0, part5C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part5C1; +--:-:-:-:1 @P2 FADD part4C2, part4C2, part5C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part5C3; + +10:-:-:-:1 @P0 FADD part6C0, part6C0, part7C0; +--:-:-:-:1 @P1 FADD part6C1, part6C1, part7C1; +--:-:-:-:1 @P2 FADD part6C2, part6C2, part7C2; +--:-:-:-:1 @P3 FADD part6C3, part6C3, part7C3; + +--:-:-:-:1 @P0 FADD part0C0, part0C0, part2C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part2C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part2C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part2C3; + +--:-:-:-:1 @P0 FADD part4C0, part4C0, part6C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part6C1; +--:-:-:-:1 
@P2 FADD part4C2, part4C2, part6C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part6C3; + +--:-:-:-:1 @P0 FADD c0, part0C0, part4C0; +--:-:-:-:1 @P1 FADD c1, part0C1, part4C1; +--:-:-:-:1 @P2 FADD c2, part0C2, part4C2; +--:-:-:-:1 @P3 FADD c3, part0C3, part4C3; + + +--:-:-:-:0 IADD cy, cy, 8; + +[+ + our $vec; + return $vec ? q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0.H0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC0.H1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC1.H0; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC1.H1; + } : q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC2; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC3; + }; ++] + +01:-:-:-:1 @P5 FFMA c0, b0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, b1, beta, c1; +04:-:-:-:1 @P5 FFMA c2, b2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, b3, beta, c3; + +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:0 ISETP.LT.AND P5, PT, cy, param_m, P5; + +--:-:1:-:1 @P0 F2F.F16.F32 c0, c0; +--:-:2:-:1 @P1 F2F.F16.F32 c1, c1; + +--:-:-:-:0 ISETP.LT.AND P4, PT, cy, param_m, PT; + +--:-:3:-:1 @P2 F2F.F16.F32 c2, c2; +--:-:4:-:1 @P3 F2F.F16.F32 c3, c3; + +[+ + our $vec; + return $vec ? 
q{ +03:-:-:-:2 @P0 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 @P0 BFI c1, c3, 0x1010, c2; + +--:1:-:-:1 @P0 STG.E.CG.64 [C], c; + } : q{ +01:-:-:-:1 @P0 STG.E.U16 [C + 2x<0>], c0; +02:-:-:-:1 @P1 STG.E.U16 [C + 2x<1>], c1; +04:-:-:-:1 @P2 STG.E.U16 [C + 2x<2>], c2; +08:1:-:-:1 @P3 STG.E.U16 [C + 2x<3>], c3; + }; ++] + +// Restore beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +01:-:-:-:6 IADD C0.CC, C0, ldc8; +--:-:-:-:0 IADD.X C1, C1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_nt_32x128.sass b/Kernel/SGEMM/Pascal/hgemm_nt_32x128.sass new file mode 100644 index 0000000..eef6e5e --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nt_32x128.sass @@ -0,0 +1,588 @@ +# Kernel: hgemm_nt_32x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<(128*16 + 32)*2 + (32*16 + 32)*2> + szShareA : (32*16 + 32) + szShareB : (128*16 + 32) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ tidX, lda, ldb, ldaz, ldbz, ldb32, tid1, tid3, tid96, ta, tb00, tb32, tb64, tb96, xmad_ta, xmad_tb, shiftX, tidY<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadA<0-3> + 84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + + 100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 110-120 ~ writeAs, writeBs, k, tidY, txa, txb00, txb32, txb64, txb96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; +--:-:3:-:1 S2R blkA, SR_CTAID.Y; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; 
+--:-:-:-:1 SHL ldb32, ldb, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidX = tid >> 2 +// tidY = (tid & 3) << 2 +// shiftX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidY, tid3, 2; +--:-:-:-:1 SHL shiftX, tid3, 3; + +// trackA += ((blkA*32 + tidX) * lda + tidY) * 2 +04:-:-:-:1 ISCADD txa, blkA, tidX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 1; + +// trackB += ((blkB*128 + tidX) * ldb + tidY) * 2 +02:-:-:-:1 ISCADD txb00, blkB, tidX, 7; +--:-:-:-:1 IADD txb32, txb00, 32; +--:-:-:-:1 IADD txb64, txb00, 64; +--:-:-:-:1 IADD txb96, txb00, 96; + +--:-:-:-:1 XMAD.LO tb00, ldb, txb00, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb00, ldbz, blkZ, tb00; +--:-:-:-:1 IADD tb32, tb00, ldb32; +--:-:-:-:1 IADD tb64, tb32, ldb32; +--:-:-:-:1 IADD tb96, tb64, ldb32; + +--:-:-:-:1 LEA track0B0.CC, tb00, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track0B1, tb00, param_B[1], RZ, 1; +--:-:-:-:1 LEA track1B0.CC, tb32, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track1B1, tb32, param_B[1], RZ, 1; +--:-:-:-:1 LEA track2B0.CC, tb64, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track2B1, tb64, param_B[1], RZ, 1; +--:-:-:-:1 LEA track3B0.CC, tb96, param_B[0], 1; +--:-:-:-:1 LEA.HI.X track3B1, tb96, param_B[1], RZ, 1; + +// writeAs = (tidY*32 + tidX + shiftX) * 4 + 4x<szShareA + szShareB> (first store targets the second buffer) +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2; + +// writeBs = (tidY*128 + tidX + shiftX) * 4 + 4x<szShareA*2 + szShareB> (B tile follows the A tile in the second buffer) +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 7; +--:-:-:-:1 IADD writeBs, writeBs, shiftX; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs,
readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4) + 4x<szShareA> (B tile follows the A tile in the first buffer) +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readBs, readBs, tid96; +--:-:-:-:1 ISCADD readBs, readBs, 4x<szShareA>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<szShareA + szShareB>; + + +REMAINDER: + + + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb32, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb64, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb96, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.CI.64 load0B, [track0B]; +--:-:2:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:3:-:1 @P4 LDG.E.CI.64 load2B, [track2B]; +--:-:4:-:1 @P5 LDG.E.CI.64 load3B, [track3B]; +--:-:5:-:1 @P6 LDG.E.CI.64 loadA, [trackA]; + + + +--:-:6:-:1 @!P2 LDS.U.64 load0B, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 load1B, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.64 load2B, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.64 load3B, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.64 loadA, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidY1, tidY, 1; +--:-:-:-:1 IADD tidY2, tidY, 2; +--:-:-:-:1 IADD tidY3, tidY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P4; + +--:-:1:-:1 @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:1:-:1 @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:1:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:1:-:1 @P3 LDG.E.CI.U16 load0B3, 
[track0B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb32, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb64, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb96, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + 
+--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P6; + +--:-:5:-:1 @P0 LDG.E.CI.U16 loadA0, [trackA + 2x<0>]; +--:-:5:-:1 @P1 LDG.E.CI.U16 loadA1, [trackA + 2x<1>]; +--:-:5:-:1 @P2 LDG.E.CI.U16 loadA2, [trackA + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 loadA3, [trackA + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb32, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P0, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P0; + + + + + our $vec; + return $vec ? 
q{ +21:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:1:-:1 F2F.F32.F16 load0B0, load0B0.H0; + +02:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + +04:-:-:-:1 F2F.F32.F16 load2B3, load2B1.H1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B1.H0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B0.H1; +--:-:3:-:1 F2F.F32.F16 load2B0, load2B0.H0; + +08:-:-:-:1 F2F.F32.F16 load3B3, load3B1.H1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B1.H0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B0.H1; +--:-:4:-:1 F2F.F32.F16 load3B0, load3B0.H0; + +10:-:-:-:1 F2F.F32.F16 loadA3, loadA1.H1; +--:-:-:-:1 F2F.F32.F16 loadA2, loadA1.H0; +--:-:-:-:1 F2F.F32.F16 loadA1, loadA0.H1; +--:-:5:-:1 F2F.F32.F16 loadA0, loadA0.H0; + } : q{ +21:-:-:-:1 F2F.F32.F16 load0B0, load0B0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:1:-:1 F2F.F32.F16 load0B3, load0B3; + +02:-:-:-:1 F2F.F32.F16 load1B0, load1B0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:2:-:1 F2F.F32.F16 load1B3, load1B3; + +04:-:-:-:1 F2F.F32.F16 load2B0, load2B0; +--:-:-:-:1 F2F.F32.F16 load2B1, load2B1; +--:-:-:-:1 F2F.F32.F16 load2B2, load2B2; +--:-:3:-:1 F2F.F32.F16 load2B3, load2B3; + +08:-:-:-:1 F2F.F32.F16 load3B0, load3B0; +--:-:-:-:1 F2F.F32.F16 load3B1, load3B1; +--:-:-:-:1 F2F.F32.F16 load3B2, load3B2; +--:-:4:-:1 F2F.F32.F16 load3B3, load3B3; + +10:-:-:-:1 F2F.F32.F16 loadA0, loadA0; +--:-:-:-:1 F2F.F32.F16 loadA1, loadA1; +--:-:-:-:1 F2F.F32.F16 loadA2, loadA2; +--:-:5:-:1 F2F.F32.F16 loadA3, loadA3; + }; + + +01:-:-:-:1 STS [writeBs + 4x<0*128 + 0*32>], load0B0; +--:-:-:-:0 IADD track0B0.CC, track0B0, 2x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 0*32>], load0B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 0*32>], load0B2; +--:-:-:-:4 STS [writeBs 
+ 4x<3*128 + 0*32>], load0B3; + +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +02:-:-:-:1 STS [writeBs + 4x<0*128 + 1*32>], load1B0; +--:-:-:-:0 IADD track1B0.CC, track1B0, 2x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 1*32>], load1B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 1*32>], load1B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 1*32>], load1B3; + +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +04:-:-:-:1 STS [writeBs + 4x<0*128 + 2*32>], load2B0; +--:-:-:-:0 IADD track2B0.CC, track2B0, 2x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 2*32>], load2B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 2*32>], load2B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 2*32>], load2B3; + +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +08:-:-:-:1 STS [writeBs + 4x<0*128 + 3*32>], load3B0; +--:-:-:-:0 IADD track3B0.CC, track3B0, 2x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 3*32>], load3B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 3*32>], load3B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 3*32>], load3B3; + +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +10:-:-:-:1 STS [writeAs + 4x<0*32>], loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 2x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*32>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*32>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*32>], loadA3; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P2 LDG.E.CI.64 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.64 load1B, [track1B]; +--:-:5:-:1 @P4 LDG.E.CI.64 load2B, [track2B]; +--:-:5:-:1 @P5 LDG.E.CI.64 load3B, [track3B]; +--:-:6:-:1 @P6 LDG.E.CI.64 loadA, [trackA]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:5:-:1 @P4 LDG.E.CI.U16 load2B0, [track2B + 2x<0>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2B2, [track2B + 2x<2>]; +--:-:5:-:1 @P4 LDG.E.CI.U16 load2B3, [track2B + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 1; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 0*32>], load0B0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 0*32>], load0B1;\n", + j3c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 0*32>], load0B2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 0*32>], load0B3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 1*32>], load1B0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 
1*32>], load1B1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 1*32>], load1B2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 1*32>], load1B3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 2*32>], load2B0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 2*32>], load2B1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 2*32>], load2B2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 2*32>], load2B3;\n", + + j9c6 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 3*32>], load3B0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 3*32>], load3B1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 3*32>], load3B2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 3*32>], load3B3;\n", + + j11c6 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<0*32>], loadA0;\n", + j11c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], loadA1;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], loadA2;\n", + j11c12 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<3*32>], loadA3;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0B0.CC, track0B0, 2x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0B1, track0B1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, 2x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2B0.CC, track2B0, 2x<16>;\n", + j7c13 => "--:-:-:-:1 \@P4 IADD.X track2B1, track2B1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3B0.CC, track3B0, 2x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3B1, track3B1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackA0.CC, trackA0, 2x<16>;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackA1, trackA1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 
=> "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.64 load0B, [track0B];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1B, [track1B];\n", + j9c29 => "10:-:5:-:1 \@P4 LDG.E.CI.64 load2B, [track2B];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.64 load3B, [track3B];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.64 loadA, [trackA];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0B3, load0B1.H1;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B2, load0B1.H0;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B1, load0B0.H1;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1B3, load1B1.H1;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B2, load1B1.H0;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B1, load1B0.H1;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j6c13 => "10:-:-:-:1 \@P4 F2F.F32.F16 load2B3, load2B1.H1;\n", + j6c17 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2B2, load2B1.H0;\n", + j6c21 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2B1, load2B0.H1;\n", + j6c25 => "--:-:5:-:1 \@P4 F2F.F32.F16 load2B0, load2B0.H0;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B3, load3B1.H1;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B2, load3B1.H0;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B1, load3B0.H1;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3B0, load3B0.H0;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadA3, loadA1.H1;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA2, loadA1.H0;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA1, loadA0.H1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA0, loadA0.H0;\n", + ) : + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.U16 
load0B0, [track0B + 2x<0>];\n", + j3c31 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j4c1 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j5c31 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j6c1 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + + j9c29 => "10:-:5:-:1 \@P4 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n", + j9c31 => "--:-:5:-:1 \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n", + j10c1 => "--:-:5:-:1 \@P4 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n", + j10c3 => "--:-:5:-:1 \@P4 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n", + + j10c8 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n", + j10c10 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n", + j10c12 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n", + + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];\n", + j11c31 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];\n", + j12c1 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0B0, load0B0;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B1, load0B1;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0B2, load0B2;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0B3, load0B3;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1B0, load1B0;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B1, load1B1;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1B2, load1B2;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1B3, load1B3;\n", + + j6c13 => "10:-:-:-:1 \@P4 F2F.F32.F16 load2B0, load2B0;\n", + j6c17 => 
"--:-:-:-:1 \@P4 F2F.F32.F16 load2B1, load2B1;\n", + j6c21 => "--:-:-:-:1 \@P4 F2F.F32.F16 load2B2, load2B2;\n", + j6c25 => "--:-:5:-:1 \@P4 F2F.F32.F16 load2B3, load2B3;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B0, load3B0;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B1, load3B1;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3B2, load3B2;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3B3, load3B3;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadA0, loadA0;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA1, loadA1;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadA2, loadA2;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadA3, loadA3;\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/hgemm_nt_32x32.sass b/Kernel/SGEMM/Pascal/hgemm_nt_32x32.sass new file mode 100644 index 0000000..1225d7d --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_nt_32x32.sass @@ -0,0 +1,1067 @@ +# Kernel: hgemm_nt_32x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+// Constant map: each entry below gives a symbolic name to a literal expression or to a c[0x0][...] slot.
+// NOTE(review): the section delimiters are not visible in this excerpt, and '//' inside the mapping tables is assumed to be accepted as it is in the instruction stream - confirm against the assembler.
+
+    addr_zero : 16x<32*65>
+    szShareA : (32*65)
+    szShareB : (32*65)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+// param_*: consecutive 4-byte slots c[0x0][0x140] .. c[0x0][0x188]. The [0]/[1] pairs of A and B feed LEA / LEA.HI.X below (presumably the low and high halves of a base address).
+    param_C[0] : c[0x0][0x140]
+    param_C[1] : c[0x0][0x144]
+    param_A[0] : c[0x0][0x148]
+    param_A[1] : c[0x0][0x14c]
+    param_B[0] : c[0x0][0x150]
+    param_B[1] : c[0x0][0x154]
+    param_alpha : c[0x0][0x158]
+    param_beta : c[0x0][0x15c]
+    param_flags : c[0x0][0x160]
+    param_lda : c[0x0][0x164]
+    param_ldb : c[0x0][0x168]
+    param_ldc : c[0x0][0x16c]
+    param_m : c[0x0][0x170]
+    param_n : c[0x0][0x174]
+    param_k : c[0x0][0x178]
+    param_ldaz : c[0x0][0x17c]
+    param_ldbz : c[0x0][0x180]
+    param_ldcz : c[0x0][0x184]
+    param_loops : c[0x0][0x188]
+
+
+// Register map: the numbers on the left are bound to the names on the right.
+// The eight cx<0-7>yN rows name numbers 0-63 in a permuted order; presumably chosen to avoid register-bank conflicts - confirm.
+    3, 2,11,10,19,18,27,26 : cx<0-7>y0
+    7, 6,15,14,23,22,31,30 : cx<0-7>y1
+    1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+    5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+// czero<00-63> names the same numbers 0-63 again, in plain order, so czeroNN and the cx?y? names overlap.
+    0-63 : czero<00-63>
+    64-79 : j0Ay<0-7>, j0Bx<0-7>
+    80-95 : j1Ay<0-7>, j1Bx<0-7>
+// 64-95 is listed a second time for the address-setup temporaries. NOTE(review): ':' vs '~' presumably means fixed-order vs allocator-chosen assignment - confirm.
+    64-95 ~ tidX, tidY, tidY<1-3>, lda, ldb, ldaz, ldbz, lda16, ldb16, tid1, tid16, tid16_8, ta<00|16>, txa<00|16>, tb<00|16>, txb<00|16>, xmad_ta, xmad_tb, shiftX, predsY0, predsY4, partialK
+
+    96-127 : load0A<0-7>, load1A<0-7>, load0B<0-7>, load1B<0-7>
+    128-135 : track0A<0-1>, track1A<0-1>, track0B<0-1>, track1B<0-1>
+
+    136-142 ~ swapBuf, readAs, readBs, writeAs, writeBs, k
+    143-149 ~ tid, blkA, blkB, blkZ, writeCs, preds
+// The ranges below repeat numbers already named above (0-31, 64-95, 96-109, 110-142); presumably these names are live only after the main loop - verify before reusing any of them earlier.
+    0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3>
+    64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+    64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+    96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-142 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc16, readCs, alpha, beta, flags, tid7, tid8
+
+// Kernel prologue. Every instruction carries a five-field control prefix; the FFMA generator later in this file builds it as "$wait:-:-:$yield:$stall", so field 1 is a wait mask, field 4 the yield flag and field 5 the stall count.
+// NOTE(review): fields 2 and 3 are presumably read/write barrier slots. In this span a wait mask of (1 << (n-1)) always appears on the first consumer of a value whose producer had n in field 3 - confirm.
+--:-:1:-:1 S2R tid, SR_TID.X; // tid <- SR_TID.X, tagged slot 1
+--:-:2:-:1 S2R blkA, SR_CTAID.Y; // blkA <- SR_CTAID.Y, tagged slot 2
+--:-:3:-:1 S2R blkB, SR_CTAID.Z; // blkB <- SR_CTAID.Z, tagged slot 3
+--:-:4:-:1 S2R blkZ, SR_CTAID.X; // blkZ <- SR_CTAID.X, tagged slot 4
+
+// Copy the k / leading-dimension constants into registers; lda16 and ldb16 are lda and ldb shifted left by 4 (x16).
+--:-:-:-:1 MOV k, param_k;
+--:-:-:-:1 MOV lda, param_lda;
+--:-:-:-:1 MOV ldb, param_ldb;
+--:-:-:-:1 MOV ldaz, param_ldaz;
+--:-:-:-:1 MOV ldbz, param_ldbz;
+--:-:-:-:1 SHL lda16, lda, 4;
+--:-:-:-:1 SHL ldb16, ldb, 4;
+// Store 128 bits of RZ at [addr_zero]; the inline Perl then emits 16 LDS.U.128 loads from that address into czero00, czero04, ..., czero60 ($_ * 4 for 0..15), i.e. all 64 czero names.
+--:-:-:-:1 STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidX = tid >> 3
+// tidY = (tid & 7) << 3
+// shiftX = (tid & 7) << 2
+01:-:-:-:1 SHR.U32 tidX, tid, 3; // wait mask 01: first use of tid (slot 1)
+--:-:-:-:1 LOP.AND tidY, tid, 7; // tidY temporarily holds tid & 7
+--:-:-:-:1 SHL shiftX, tidY, 2; // taken from the unscaled tidY, before the next line rescales it
+--:-:-:-:1 SHL tidY, tidY, 3;
+
+// trackA += ((blkA*32 + tidX) * lda + tidY) * 2
+02:-:-:-:1 ISCADD txa00, blkA, tidX, 5; // wait mask 02: first use of blkA (slot 2); txa00 = blkA*32 + tidX
+--:-:-:-:1 IADD txa16, txa00, 16; // same index 16 further on
+--:-:-:-:1 XMAD.LO ta00, lda, txa00, tidY, xmad_ta; // presumably ta00 = lda*txa00 + tidY with xmad_ta as scratch - confirm
+08:-:-:-:1 XMAD.LO2 ta00, ldaz, blkZ, ta00; // wait mask 08: first use of blkZ (slot 4); presumably ta00 += ldaz*blkZ
+--:-:-:-:1 IADD ta16, ta00, lda16; // offset for the txa16 index: ta00 + 16*lda
+--:-:-:-:1 LEA track0A0.CC, ta00, param_A[0], 1; // shift of 1 supplies the "* 2" of the formula above
+--:-:-:-:1 LEA.HI.X track0A1, ta00, param_A[1], RZ, 1; // second half of the track0A pair; presumably consumes the .CC carry
+--:-:-:-:1 LEA track1A0.CC, ta16, param_A[0], 1;
+--:-:-:-:1 LEA.HI.X track1A1, ta16, param_A[1], RZ, 1;
+
+--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; // P2 = txa00 < param_m
+--:-:-:-:1 ISETP.LT.AND P3, PT, txa16, param_m, PT; // P3 = txa16 < param_m
+
+// trackB += ((blkB*32 + tidX) * ldb + tidY) * 2
+04:-:-:-:1 ISCADD txb00, blkB, tidX, 5; // wait mask 04: first use of blkB (slot 3); txb00 = blkB*32 + tidX
+--:-:-:-:1 IADD txb16, txb00, 16;
+--:-:-:-:1 XMAD.LO tb00, ldb, txb00, tidY, xmad_tb;
+--:-:-:-:1 XMAD.LO2 tb00, ldbz, blkZ, tb00; // no wait mask here: blkZ was already waited on in the A path
+--:-:-:-:1 IADD tb16, tb00, ldb16;
+--:-:-:-:1 LEA track0B0.CC, tb00, param_B[0], 1;
+--:-:-:-:1 LEA.HI.X track0B1, tb00, param_B[1], RZ, 1;
+--:-:-:-:1 LEA track1B0.CC, tb16, param_B[0], 1;
+--:-:-:-:1 LEA.HI.X track1B1, tb16, param_B[1], RZ, 1;
+
+--:-:-:-:1 ISETP.LT.AND P4, PT, txb00, param_n, PT; // P4 = txb00 < param_n
+--:-:-:-:1 ISETP.LT.AND P5, PT, txb16, param_n, PT; // P5 = txb16 < param_n
+
+--:-:-:-:1 P2R preds, PR, RZ, 0x3c; // save predicate bits under mask 0x3c (bits 2-5, presumably P2-P5 set above) into preds
+
+// writeAs = (tidY*32 + tidX + shiftX) * 4
+--:-:-:-:1 ISCADD writeAs, tidY, tidX, 5; // tidY*32 + tidX
+--:-:-:-:1 IADD writeAs, writeAs, shiftX; // + shiftX; the "* 4" is not applied within this span
+--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (tidY*32 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 5; +--:-:-:-:1 IADD writeBs, writeBs, shiftX; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + + +// readAs = (((tid & 8) >> 2) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 8; +--:-:-:-:1 SHR.U32 readAs, readAs, 2; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid >> 1) & 3) << 4 +--:-:-:-:1 BFE.U32 readBs, tid, 0x201; // 2 bits at position 1 +--:-:-:-:1 SHL readBs, readBs, 4; + +// tid16 = tid & -16 +// tid16_8 = tid16 / 2 * 4 +--:-:-:-:1 LOP.AND tid16, tid, -16; +--:-:-:-:1 SHL tid16_8, tid16, 1; + +// writeCs = (readAs + tid16*4) * 32 + readBs; +--:-:-:-:1 ISCADD writeCs, tid16, readAs, 2; +--:-:-:-:1 ISCADD writeCs, writeCs, readBs, 5; + +// Each block of 16 threads works on 8 lines, shifted over by 4 +// readAs += tid16_8 * 32 + tid16 +// readBs += tid16_8 * 32 + tid16 + 4x +--:-:-:-:1 ISCADD readAs, tid16_8, readAs, 5; +--:-:-:-:1 ISCADD readBs, tid16_8, readBs, 5; +--:-:-:-:1 IADD readAs, tid16, readAs; +--:-:-:-:1 IADD3 readBs, tid16, 4x, readBs; + +--:-:-:-:1 MOV32I swapBuf, 4x; + +// If k is not a multiple of 64 we want to grab the partial amount on the first fetch. +// If it is a multiple of 64 then make a full 64 line fetch. +--:-:-:-:1 LOP.AND.Z P0, partialK, k, 63; +--:-:-:-:1 @P0 MOV partialK, 64; +--:-:-:-:1 IADD k, k, -partialK; +[+ + our $vec; + return $vec ? 
q{ + +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY, partialK, PT; +--:-:-:-:1 @P1 R2P PR, preds, 0x3c; +--:-:-:-:1 @!P1 R2P PR, RZ, 0x3c; + + +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.128 load1A, [track1A]; +--:-:4:-:1 @P4 LDG.E.CI.128 load0B, [track0B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load1B, [track1B]; + + + +--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:-:-:1 @!P3 LDS.U.128 load1A, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 load0B, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 load1B, [addr_zero]; + + + } : q{ +--:-:-:-:1 IADD tidY1, tidY, 1; +--:-:-:-:1 IADD tidY2, tidY, 2; +--:-:-:-:1 IADD tidY3, tidY, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, partialK, PT; +--:-:-:-:1 P2R predsY0, PR, RZ, 0x0f; + +--:-:-:-:1 IADD tidY, tidY, 4; +--:-:-:-:1 IADD tidY1, tidY1, 4; +--:-:-:-:1 IADD tidY2, tidY2, 4; +--:-:-:-:1 IADD tidY3, tidY3, 4; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, partialK, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, partialK, PT; +--:-:-:-:1 P2R predsY4, PR, RZ, 0x0f; + + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 @P4 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 @P4 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 
load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load0A4, RZ; +--:-:-:-:1 @!P1 MOV load0A5, RZ; +--:-:-:-:1 @!P2 MOV load0A6, RZ; +--:-:-:-:1 @!P3 MOV load0A7, RZ; + + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa16, param_m, PT; +--:-:-:-:1 @P5 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 @P5 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1A4, [track1A + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1A5, [track1A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1A6, [track1A + 2x<6>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load1A4, RZ; +--:-:-:-:1 @!P1 MOV load1A5, RZ; +--:-:-:-:1 @!P2 MOV load1A6, RZ; +--:-:-:-:1 @!P3 MOV load1A7, RZ; + + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb00, param_n, PT; +--:-:-:-:1 @P6 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 @P6 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load0B4, [track0B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load0B5, [track0B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0B6, [track0B + 2x<6>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV 
load0B4, RZ; +--:-:-:-:1 @!P1 MOV load0B5, RZ; +--:-:-:-:1 @!P2 MOV load0B6, RZ; +--:-:-:-:1 @!P3 MOV load0B7, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb16, param_n, PT; +--:-:-:-:1 @P4 R2P PR, predsY0, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 @P4 R2P PR, predsY4, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 @P0 LDG.E.CI.U16 load1B4, [track1B + 2x<4>]; +--:-:-:-:1 @P1 LDG.E.CI.U16 load1B5, [track1B + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load1B6, [track1B + 2x<6>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 load1B7, [track1B + 2x<7>]; + +--:-:-:-:1 @!P0 MOV load1B4, RZ; +--:-:-:-:1 @!P1 MOV load1B5, RZ; +--:-:-:-:1 @!P2 MOV load1B6, RZ; +--:-:-:-:1 @!P3 MOV load1B7, RZ; + }; ++] +--:-:-:-:1 SHL partialK, partialK, 1; + +--:-:-:-:1 ISETP.GE.AND P0, PT, k, 64, PT; +--:-:-:-:1 IADD k, k, -64; +--:-:-:-:1 @P0 R2P PR, preds, 0x3c; +--:-:-:-:1 @!P0 R2P PR, RZ, 0x3c; + + +[+ + our $vec; + return $vec ? 
q{ +22:-:-:-:1 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A2.H1; +--:-:6:-:1 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0.H0; + } : q{ +02:-:-:-:1 F2F.F32.F16 load0A7, load0A7; +--:-:-:-:1 F2F.F32.F16 load0A6, load0A6; +--:-:-:-:1 F2F.F32.F16 load0A5, load0A5; +--:-:6:-:1 F2F.F32.F16 load0A4, load0A4; +--:-:-:-:1 F2F.F32.F16 load0A3, load0A3; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:2:-:1 F2F.F32.F16 load0A0, load0A0; + }; ++] +--:-:-:-:0 IADD track0A0.CC, track0A0, partialK; +20:-:-:-:1 STS [writeAs + 4x<7*32 + 0*16>], load0A7; +--:-:-:-:1 STS [writeAs + 4x<6*32 + 0*16>], load0A6; +--:-:-:-:1 STS [writeAs + 4x<5*32 + 0*16>], load0A5; +--:-:-:-:1 STS [writeAs + 4x<4*32 + 0*16>], load0A4; +02:-:-:-:1 STS [writeAs + 4x<3*32 + 0*16>], load0A3; +--:-:-:-:1 STS [writeAs + 4x<2*32 + 0*16>], load0A2; +--:-:-:-:1 STS [writeAs + 4x<1*32 + 0*16>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<0*32 + 0*16>], load0A0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +04:-:-:-:1 F2F.F32.F16 load1A7, load1A3.H1; +--:-:-:-:1 F2F.F32.F16 load1A6, load1A3.H0; +--:-:-:-:1 F2F.F32.F16 load1A5, load1A2.H1; +--:-:6:-:1 F2F.F32.F16 load1A4, load1A2.H0; +--:-:-:-:1 F2F.F32.F16 load1A3, load1A1.H1; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A1.H0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A0.H1; +--:-:2:-:1 F2F.F32.F16 load1A0, load1A0.H0; + } : q{ +04:-:-:-:1 F2F.F32.F16 load1A7, load1A7; +--:-:-:-:1 F2F.F32.F16 load1A6, load1A6; +--:-:-:-:1 F2F.F32.F16 load1A5, load1A5; +--:-:6:-:1 F2F.F32.F16 load1A4, load1A4; +--:-:-:-:1 F2F.F32.F16 load1A3, load1A3; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A2; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A1; +--:-:2:-:1 F2F.F32.F16 load1A0, load1A0; + }; ++] +--:-:-:-:0 IADD track1A0.CC, track1A0, partialK; +20:-:-:-:1 STS [writeAs + 4x<7*32 + 1*16>], load1A7; +--:-:-:-:1 STS [writeAs + 4x<6*32 + 1*16>], load1A6; +--:-:-:-:1 STS [writeAs + 4x<5*32 + 1*16>], load1A5; +--:-:-:-:1 STS [writeAs + 4x<4*32 + 1*16>], load1A4; +02:-:-:-:1 STS [writeAs + 4x<3*32 + 1*16>], load1A3; +--:-:-:-:1 STS [writeAs + 4x<2*32 + 1*16>], load1A2; +--:-:-:-:1 STS [writeAs + 4x<1*32 + 1*16>], load1A1; +--:-:-:-:1 STS [writeAs + 4x<0*32 + 1*16>], load1A0; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +08:-:-:-:1 F2F.F32.F16 load0B7, load0B3.H1; +--:-:-:-:1 F2F.F32.F16 load0B6, load0B3.H0; +--:-:-:-:1 F2F.F32.F16 load0B5, load0B2.H1; +--:-:6:-:1 F2F.F32.F16 load0B4, load0B2.H0; +--:-:-:-:1 F2F.F32.F16 load0B3, load0B1.H1; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B1.H0; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B0.H1; +--:-:2:-:1 F2F.F32.F16 load0B0, load0B0.H0; + } : q{ +08:-:-:-:1 F2F.F32.F16 load0B7, load0B7; +--:-:-:-:1 F2F.F32.F16 load0B6, load0B6; +--:-:-:-:1 F2F.F32.F16 load0B5, load0B5; +--:-:6:-:1 F2F.F32.F16 load0B4, load0B4; +--:-:-:-:1 F2F.F32.F16 load0B3, load0B3; +--:-:-:-:1 F2F.F32.F16 load0B2, load0B2; +--:-:-:-:1 F2F.F32.F16 load0B1, load0B1; +--:-:2:-:1 F2F.F32.F16 load0B0, load0B0; + }; ++] +--:-:-:-:0 IADD track0B0.CC, track0B0, partialK; +20:-:-:-:1 STS [writeBs + 4x<7*32 + 0*16>], load0B7; +--:-:-:-:1 STS [writeBs + 4x<6*32 + 0*16>], load0B6; +--:-:-:-:1 STS [writeBs + 4x<5*32 + 0*16>], load0B5; +--:-:-:-:1 STS [writeBs + 4x<4*32 + 0*16>], load0B4; +02:-:-:-:1 STS [writeBs + 4x<3*32 + 0*16>], load0B3; +--:-:-:-:1 STS [writeBs + 4x<2*32 + 0*16>], load0B2; +--:-:-:-:1 STS [writeBs + 4x<1*32 + 0*16>], load0B1; +--:-:-:-:1 STS [writeBs + 4x<0*32 + 0*16>], load0B0; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +[+ + our $vec; + return $vec ? 
q{ +10:-:-:-:1 F2F.F32.F16 load1B7, load1B3.H1; +--:-:-:-:1 F2F.F32.F16 load1B6, load1B3.H0; +--:-:-:-:1 F2F.F32.F16 load1B5, load1B2.H1; +--:-:6:-:1 F2F.F32.F16 load1B4, load1B2.H0; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B1.H1; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B1.H0; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B0.H1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0.H0; + } : q{ +10:-:-:-:1 F2F.F32.F16 load1B7, load1B7; +--:-:-:-:1 F2F.F32.F16 load1B6, load1B6; +--:-:-:-:1 F2F.F32.F16 load1B5, load1B5; +--:-:6:-:1 F2F.F32.F16 load1B4, load1B4; +--:-:-:-:1 F2F.F32.F16 load1B3, load1B3; +--:-:-:-:1 F2F.F32.F16 load1B2, load1B2; +--:-:-:-:1 F2F.F32.F16 load1B1, load1B1; +--:-:2:-:1 F2F.F32.F16 load1B0, load1B0; + }; ++] +--:-:-:-:0 IADD track1B0.CC, track1B0, partialK; +20:-:-:-:1 STS [writeBs + 4x<7*32 + 1*16>], load1B7; +--:-:-:-:1 STS [writeBs + 4x<6*32 + 1*16>], load1B6; +--:-:-:-:1 STS [writeBs + 4x<5*32 + 1*16>], load1B5; +--:-:-:-:1 STS [writeBs + 4x<4*32 + 1*16>], load1B4; +02:-:-:-:1 STS [writeBs + 4x<3*32 + 1*16>], load1B3; +--:-:-:-:1 STS [writeBs + 4x<2*32 + 1*16>], load1B2; +--:-:-:-:1 STS [writeBs + 4x<1*32 + 1*16>], load1B1; +--:-:-:-:1 STS [writeBs + 4x<0*32 + 1*16>], load1B0; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00>]; +--:-:-:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*32 + 16>]; + +[+ + our $vec; + return $vec ? 
q{ +--:-:2:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P3 LDG.E.CI.128 load1A, [track1A]; +--:-:4:-:1 @P4 LDG.E.CI.128 load0B, [track0B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load1B, [track1B]; + } : q{ +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>]; +--:-:-:-:1 @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>]; + +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A4, [track1A + 2x<4>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A5, [track1A + 2x<5>]; +--:-:-:-:1 @P3 LDG.E.CI.U16 load1A6, [track1A + 2x<6>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>]; + +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B2, [track0B + 2x<2>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B3, [track0B + 2x<3>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B4, [track0B + 2x<4>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B5, [track0B + 2x<5>]; +--:-:-:-:1 @P4 LDG.E.CI.U16 load0B6, [track0B + 2x<6>]; +--:-:4:-:1 @P4 LDG.E.CI.U16 load0B7, [track0B + 2x<7>]; + +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B0, [track1B + 2x<0>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B1, [track1B + 2x<1>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B3, [track1B + 2x<3>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B4, [track1B + 2x<4>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B5, [track1B + 2x<5>]; +--:-:-:-:1 @P5 LDG.E.CI.U16 load1B6, [track1B + 2x<6>]; +--:-:5:-:1 @P5 
LDG.E.CI.U16 load1B7, [track1B + 2x<7>]; + }; ++] + +LOOP: + +[+ + our $vec; + our %insert = + ( + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, RZ, PT;\n", + j0c10 => "--:-:-:-:1 ISETP.GE.AND P1, PT, k, 64, PT;\n" . + "--:-:-:-:1 IADD k, k, -64;\n", + + j0c23 => "--:-:-:-:1 \@P1 R2P PR, preds, 0x3c;\n", + j0c24 => "--:-:-:-:1 \@!P1 R2P PR, RZ, 0x3c;\n", + + j3c32 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 2x<64>;\n", + j3c37 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j4c32 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 2x<64>;\n", + j4c37 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j5c32 => "--:-:-:-:1 \@P4 IADD track0B0.CC, track0B0, 2x<64>;\n", + j5c37 => "--:-:-:-:1 \@P4 IADD.X track0B1, track0B1, RZ;\n", + j6c32 => "--:-:-:-:1 \@P5 IADD track1B0.CC, track1B0, 2x<64>;\n", + j6c37 => "--:-:-:-:1 \@P5 IADD.X track1B1, track1B1, RZ;\n", + + j6c63 => "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j2c45 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n", + j2c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n", + j2c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n", + j2c57 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n", + j2c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n", + j3c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n", + j3c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n", + j3c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j3c45 => "04:-:-:-:1 \@P0 F2F.F32.F16 load1A7, load1A3.H1;\n", + j3c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A6, load1A3.H0;\n", + j3c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A5, load1A2.H1;\n", + j3c57 => "--:-:3:-:1 \@P0 F2F.F32.F16 load1A4, load1A2.H0;\n", + j3c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A3, load1A1.H1;\n", + j4c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A2, load1A1.H0;\n", + j4c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A1, load1A0.H1;\n", + j4c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load1A0, load1A0.H0;\n", + + j4c45 => "08:-:-:-:1 \@P0 F2F.F32.F16 load0B7, load0B3.H1;\n", + j4c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B6, load0B3.H0;\n", + j4c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B5, load0B2.H1;\n", + j4c57 => "--:-:4:-:1 \@P0 F2F.F32.F16 load0B4, load0B2.H0;\n", + j4c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n", + j5c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n", + j5c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n", + j5c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n", + + j5c45 => "10:-:-:-:1 \@P0 F2F.F32.F16 load1B7, load1B3.H1;\n", + j5c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B6, load1B3.H0;\n", + j5c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B5, load1B2.H1;\n", + j5c57 => "--:-:5:-:1 \@P0 F2F.F32.F16 load1B4, load1B2.H0;\n", + j5c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n", + j6c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n", + j6c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, 
load1B0.H1;\n", + j6c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n", + + j3c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<7*32 + 0*16>], load0A7;\n", + j3c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32 + 0*16>], load0A6;\n", + j3c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32 + 0*16>], load0A5;\n", + j3c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*32 + 0*16>], load0A4;\n", + j3c24 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<3*32 + 0*16>], load0A3;\n", + j3c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32 + 0*16>], load0A2;\n", + j3c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32 + 0*16>], load0A1;\n", + j3c30 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<0*32 + 0*16>], load0A0;\n", + + j4c16 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<7*32 + 1*16>], load1A7;\n", + j4c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32 + 1*16>], load1A6;\n", + j4c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32 + 1*16>], load1A5;\n", + j4c22 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<4*32 + 1*16>], load1A4;\n", + j4c24 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<3*32 + 1*16>], load1A3;\n", + j4c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32 + 1*16>], load1A2;\n", + j4c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32 + 1*16>], load1A1;\n", + j4c30 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<0*32 + 1*16>], load1A0;\n", + + j5c16 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<7*32 + 0*16>], load0B7;\n", + j5c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*32 + 0*16>], load0B6;\n", + j5c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*32 + 0*16>], load0B5;\n", + j5c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*32 + 0*16>], load0B4;\n", + j5c24 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<3*32 + 0*16>], load0B3;\n", + j5c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*32 + 0*16>], load0B2;\n", + j5c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*32 + 0*16>], load0B1;\n", + j5c30 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<0*32 + 0*16>], load0B0;\n", + + j6c16 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<7*32 + 1*16>], load1B7;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*32 + 1*16>], 
load1B6;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*32 + 1*16>], load1B5;\n", + j6c22 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<4*32 + 1*16>], load1B4;\n", + j6c24 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<3*32 + 1*16>], load1B3;\n", + j6c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*32 + 1*16>], load1B2;\n", + j6c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*32 + 1*16>], load1B1;\n", + j6c30 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<0*32 + 1*16>], load1B0;\n", + + j3c62 => "02:-:2:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j4c62 => "04:-:3:-:1 \@P3 LDG.E.CI.128 load1A, [track1A];\n", + j5c62 => "08:-:4:-:1 \@P4 LDG.E.CI.128 load0B, [track0B];\n", + j6c62 => "10:-:5:-:1 \@P5 LDG.E.CI.128 load1B, [track1B];\n", + ) : + ( + j2c45 => "02:-:-:-:1 \@P0 F2F.F32.F16 load0A0, load0A0;\n", + j2c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A1, load0A1;\n", + j2c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A2, load0A2;\n", + j2c57 => "--:-:2:-:1 \@P0 F2F.F32.F16 load0A3, load0A3;\n", + j2c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A4, load0A4;\n", + j3c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A5, load0A5;\n", + j3c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0A6, load0A6;\n", + j3c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load0A7, load0A7;\n", + + j3c45 => "04:-:-:-:1 \@P0 F2F.F32.F16 load1A0, load1A0;\n", + j3c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A1, load1A1;\n", + j3c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A2, load1A2;\n", + j3c57 => "--:-:3:-:1 \@P0 F2F.F32.F16 load1A3, load1A3;\n", + j3c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A4, load1A4;\n", + j4c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A5, load1A5;\n", + j4c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1A6, load1A6;\n", + j4c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load1A7, load1A7;\n", + + j4c45 => "08:-:-:-:1 \@P0 F2F.F32.F16 load0B0, load0B0;\n", + j4c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B1, load0B1;\n", + j4c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B2, load0B2;\n", + j4c57 => "--:-:4:-:1 \@P0 F2F.F32.F16 load0B3, load0B3;\n", + j4c61 => "--:-:-:-:1 \@P0 
F2F.F32.F16 load0B4, load0B4;\n", + j5c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B5, load0B5;\n", + j5c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load0B6, load0B6;\n", + j5c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load0B7, load0B7;\n", + + j5c45 => "10:-:-:-:1 \@P0 F2F.F32.F16 load1B0, load1B0;\n", + j5c49 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B1, load1B1;\n", + j5c53 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B2, load1B2;\n", + j5c57 => "--:-:5:-:1 \@P0 F2F.F32.F16 load1B3, load1B3;\n", + j5c61 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B4, load1B4;\n", + j6c1 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B5, load1B5;\n", + j6c5 => "--:-:-:-:1 \@P0 F2F.F32.F16 load1B6, load1B6;\n", + j6c9 => "--:-:6:-:1 \@P0 F2F.F32.F16 load1B7, load1B7;\n", + + j3c16 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<0*32 + 0*16>], load0A0;\n", + j3c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32 + 0*16>], load0A1;\n", + j3c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32 + 0*16>], load0A2;\n", + j3c22 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<3*32 + 0*16>], load0A3;\n", + j3c24 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<4*32 + 0*16>], load0A4;\n", + j3c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32 + 0*16>], load0A5;\n", + j3c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32 + 0*16>], load0A6;\n", + j3c30 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<7*32 + 0*16>], load0A7;\n", + + j4c16 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*32 + 1*16>], load1A0;\n", + j4c18 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32 + 1*16>], load1A1;\n", + j4c20 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32 + 1*16>], load1A2;\n", + j4c22 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<3*32 + 1*16>], load1A3;\n", + j4c24 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<4*32 + 1*16>], load1A4;\n", + j4c26 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<5*32 + 1*16>], load1A5;\n", + j4c28 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<6*32 + 1*16>], load1A6;\n", + j4c30 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<7*32 + 1*16>], load1A7;\n", + + j5c16 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<0*32 + 0*16>], load0B0;\n", + j5c18 => "--:-:-:-:1 
\@P0 STS [writeBs + 4x<1*32 + 0*16>], load0B1;\n", + j5c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*32 + 0*16>], load0B2;\n", + j5c22 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<3*32 + 0*16>], load0B3;\n", + j5c24 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<4*32 + 0*16>], load0B4;\n", + j5c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*32 + 0*16>], load0B5;\n", + j5c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*32 + 0*16>], load0B6;\n", + j5c30 => "--:6:-:-:1 \@P0 STS [writeBs + 4x<7*32 + 0*16>], load0B7;\n", + + j6c16 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*32 + 1*16>], load1B0;\n", + j6c18 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*32 + 1*16>], load1B1;\n", + j6c20 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*32 + 1*16>], load1B2;\n", + j6c22 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<3*32 + 1*16>], load1B3;\n", + j6c24 => "20:-:-:-:1 \@P0 STS [writeBs + 4x<4*32 + 1*16>], load1B4;\n", + j6c26 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<5*32 + 1*16>], load1B5;\n", + j6c28 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<6*32 + 1*16>], load1B6;\n", + j6c30 => "--:6:-:-:1 \@P0 STS [writeBs + 4x<7*32 + 1*16>], load1B7;\n", + + j3c48 => "02:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j3c50 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j3c52 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j3c54 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + j3c56 => "20:-:-:-:1 \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n", + j3c58 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n", + j3c60 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n", + j3c62 => "--:-:2:-:1 \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n", + + j4c48 => "04:-:-:-:1 \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n", + j4c50 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n", + j4c52 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n", + j4c54 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n", + j4c56 => "20:-:-:-:1 \@P3 
LDG.E.CI.U16 load1A4, [track1A + 2x<4>];\n", + j4c58 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A5, [track1A + 2x<5>];\n", + j4c60 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A6, [track1A + 2x<6>];\n", + j4c62 => "--:-:3:-:1 \@P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>];\n", + + j5c48 => "08:-:-:-:1 \@P4 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", + j5c50 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n", + j5c52 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n", + j5c54 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n", + j5c56 => "20:-:-:-:1 \@P4 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];\n", + j5c58 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];\n", + j5c60 => "--:-:-:-:1 \@P4 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];\n", + j5c62 => "--:-:4:-:1 \@P4 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];\n", + + j6c48 => "10:-:-:-:1 \@P5 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n", + j6c50 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n", + j6c52 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n", + j6c54 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n", + j6c56 => "20:-:-:-:1 \@P5 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];\n", + j6c58 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];\n", + j6c60 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];\n", + j6c62 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];\n", + ) + ), + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ); + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + my $out = ''; + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? 
'@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $stall = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $wait = $c == 0 ? '01' : '--'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; ++] + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// readCs = ((tid & 7) * 4 + (tid / 8) * 32) * 4 +--:-:-:-:1 LOP.AND tid7, tid, 7; +--:-:-:-:1 SHR.U32 tid8, tid, 3; +--:-:-:-:1 SHL tid7, tid7, 2; +--:-:-:-:1 ISCADD readCs, tid8, tid7, 5; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*32 + tid7; +--:-:-:-:1 ISCADD cx, blkB, tid7, 5; +--:-:-:-:1 IADD cx1, cx, 1; +--:-:-:-:1 IADD cx2, cx, 2; +--:-:-:-:1 IADD cx3, cx, 3; + +// cy = blkA*32 + tid8 +--:-:-:-:1 ISCADD cy, blkA, tid8, 5; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 SHL ldc16, ldc, 5; + +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C0.CC, ci, param_C[0], 1; +--:-:-:-:1 LEA.HI.X C1, ci, param_C[1], RZ, 1; + +// P0 = cx < n +--:-:-:-:1 ISETP.LT.AND P0, PT, cx, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P1, PT, cx1, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P2, PT, cx2, param_n, PT; +--:-:-:-:1 
ISETP.LT.AND P3, PT, cx3, param_n, PT; +--:-:-:-:1 P2R preds, PR, RZ, 0x0f; + +// P4 = cy < m +--:-:-:-:1 ISETP.LT.AND P4, PT, cy, param_m, PT; + +// P5 = beta != 0 && P4 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P4; + +// P6 = Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; + +// Init beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + + + +--:-:-:-:1 FMUL shuffle_x0y0, cx0y0, alpha; +--:-:-:-:1 FMUL shuffle_x1y0, cx1y0, alpha; +--:-:-:-:1 FMUL shuffle_x2y0, cx2y0, alpha; +--:-:-:-:1 FMUL shuffle_x3y0, cx3y0, alpha; +--:-:-:-:1 FMUL shuffle_x4y0, cx4y0, alpha; +--:-:-:-:1 FMUL shuffle_x5y0, cx5y0, alpha; +--:-:-:-:1 FMUL shuffle_x6y0, cx6y0, alpha; +--:-:-:-:0 FMUL shuffle_x7y0, cx7y0, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32 + 00>], shuffle_x0y0; +--:-:-:-:1 FMUL shuffle_x0y1, cx0y1, alpha; +--:-:-:-:1 FMUL shuffle_x1y1, cx1y1, alpha; +--:-:-:-:1 FMUL shuffle_x2y1, cx2y1, alpha; +--:-:-:-:0 FMUL shuffle_x3y1, cx3y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32 + 16>], shuffle_x4y0; +--:-:-:-:1 FMUL shuffle_x4y1, cx4y1, alpha; +--:-:-:-:1 FMUL shuffle_x5y1, cx5y1, alpha; +--:-:-:-:1 FMUL shuffle_x6y1, cx6y1, alpha; +--:-:-:-:0 FMUL shuffle_x7y1, cx7y1, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*32 + 00>], shuffle_x0y1; +--:-:-:-:1 FMUL shuffle_x0y2, cx0y2, alpha; +--:-:-:-:1 FMUL shuffle_x1y2, cx1y2, alpha; +--:-:-:-:1 FMUL shuffle_x2y2, cx2y2, alpha; +--:-:-:-:0 FMUL shuffle_x3y2, cx3y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*32 + 16>], shuffle_x4y1; +--:-:-:-:1 FMUL shuffle_x4y2, cx4y2, alpha; +--:-:-:-:1 FMUL shuffle_x5y2, cx5y2, alpha; +--:-:-:-:1 FMUL shuffle_x6y2, cx6y2, alpha; +--:-:-:-:0 FMUL shuffle_x7y2, cx7y2, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*32 + 00>], shuffle_x0y2; +--:-:-:-:1 FMUL shuffle_x0y3, cx0y3, alpha; +--:-:-:-:1 FMUL shuffle_x1y3, cx1y3, alpha; +--:-:-:-:1 FMUL shuffle_x2y3, cx2y3, alpha; +--:-:-:-:0 FMUL shuffle_x3y3, cx3y3, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*32 + 16>], shuffle_x4y2; 
+--:-:-:-:1 FMUL shuffle_x4y3, cx4y3, alpha; +--:-:-:-:1 FMUL shuffle_x5y3, cx5y3, alpha; +--:-:-:-:1 FMUL shuffle_x6y3, cx6y3, alpha; +--:-:-:-:0 FMUL shuffle_x7y3, cx7y3, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*32 + 00>], shuffle_x0y3; +--:-:-:-:1 STS.128 [writeCs+4x<3*32 + 16>], shuffle_x4y3; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:1 FMUL shuffle_x0y4, cx0y4, alpha; +--:-:-:-:1 FMUL shuffle_x1y4, cx1y4, alpha; +--:-:-:-:1 FMUL shuffle_x2y4, cx2y4, alpha; +--:-:-:-:1 FMUL shuffle_x3y4, cx3y4, alpha; +--:-:-:-:1 FMUL shuffle_x4y4, cx4y4, alpha; +--:-:-:-:1 FMUL shuffle_x5y4, cx5y4, alpha; +--:-:-:-:0 FMUL shuffle_x6y4, cx6y4, alpha; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 FMUL shuffle_x7y4, cx7y4, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32 + 00>], shuffle_x0y4; +--:-:-:-:1 FMUL shuffle_x0y5, cx0y5, alpha; +--:-:-:-:1 FMUL shuffle_x1y5, cx1y5, alpha; +--:-:-:-:1 FMUL shuffle_x2y5, cx2y5, alpha; +--:-:-:-:0 FMUL shuffle_x3y5, cx3y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<0*32 + 16>], shuffle_x4y4; +--:-:-:-:1 FMUL shuffle_x4y5, cx4y5, alpha; +--:-:-:-:1 FMUL shuffle_x5y5, cx5y5, alpha; +--:-:-:-:1 FMUL shuffle_x6y5, cx6y5, alpha; +--:-:-:-:0 FMUL shuffle_x7y5, cx7y5, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*32 + 00>], shuffle_x0y5; +--:-:-:-:1 FMUL shuffle_x0y6, cx0y6, alpha; +--:-:-:-:1 FMUL shuffle_x1y6, cx1y6, alpha; +--:-:-:-:1 FMUL shuffle_x2y6, cx2y6, alpha; +--:-:-:-:0 FMUL shuffle_x3y6, cx3y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<1*32 + 16>], shuffle_x4y5; +--:-:-:-:1 FMUL shuffle_x4y6, cx4y6, alpha; +--:-:-:-:1 FMUL shuffle_x5y6, cx5y6, alpha; +--:-:-:-:1 FMUL shuffle_x6y6, cx6y6, alpha; +--:-:-:-:0 FMUL shuffle_x7y6, cx7y6, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*32 + 00>], shuffle_x0y6; +--:-:-:-:1 FMUL shuffle_x0y7, cx0y7, alpha; +--:-:-:-:1 FMUL shuffle_x1y7, cx1y7, alpha; +--:-:-:-:1 FMUL shuffle_x2y7, cx2y7, alpha; +--:-:-:-:0 FMUL shuffle_x3y7, cx3y7, alpha; +--:-:-:-:1 STS.128 [writeCs+4x<2*32 + 16>], shuffle_x4y6; 
+--:-:-:-:1 FMUL shuffle_x4y7, cx4y7, alpha; +--:-:-:-:1 FMUL shuffle_x5y7, cx5y7, alpha; +--:-:-:-:1 FMUL shuffle_x6y7, cx6y7, alpha; +--:-:-:-:0 FMUL shuffle_x7y7, cx7y7, alpha; +--:-:-:-:4 STS.128 [writeCs+4x<3*32 + 00>], shuffle_x0y7; +--:-:-:-:1 STS.128 [writeCs+4x<3*32 + 16>], shuffle_x4y7; +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:5 CAL STORE_C; + +--:-:-:-:5 EXIT; + +STORE_C: + +[+ + our $vec; + return $vec ? q{ +--:-:1:-:1 @P0 LDG.E.64 loadC, [C]; + } : q{ +--:-:-:-:0 @!P0 MOV loadC0, RZ; +--:-:-:-:1 @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>]; +--:-:-:-:0 @!P1 MOV loadC1, RZ; +--:-:-:-:1 @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>]; +--:-:-:-:0 @!P2 MOV loadC2, RZ; +--:-:-:-:1 @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>]; +--:-:-:-:0 @!P3 MOV loadC3, RZ; +--:-:1:-:1 @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>]; + }; ++] + +// Restore output preds +--:-:-:-:1 @P4 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; + +--:-:-:-:1 LDS.U.128 part0C, [readCs + 4x<0*16*32>]; +--:-:2:-:1 LDS.U.128 part1C, [readCs + 4x<1*16*32>]; +--:-:-:-:1 LDS.U.128 part2C, [readCs + 4x<2*16*32>]; +--:-:3:-:1 LDS.U.128 part3C, [readCs + 4x<3*16*32>]; +--:-:-:-:1 LDS.U.128 part4C, [readCs + 4x<4*16*32>]; +--:-:4:-:1 LDS.U.128 part5C, [readCs + 4x<5*16*32>]; +--:-:-:-:1 LDS.U.128 part6C, [readCs + 4x<6*16*32>]; +--:-:5:-:1 LDS.U.128 part7C, [readCs + 4x<7*16*32>]; + + +02:-:-:-:1 @P0 FADD part0C0, part0C0, part1C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part1C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part1C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part1C3; + +04:-:-:-:1 @P0 FADD part2C0, part2C0, part3C0; +--:-:-:-:1 @P1 FADD part2C1, part2C1, part3C1; +--:-:-:-:1 @P2 FADD part2C2, part2C2, part3C2; +--:-:-:-:1 @P3 FADD part2C3, part2C3, part3C3; + +08:-:-:-:1 @P0 FADD part4C0, part4C0, part5C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part5C1; +--:-:-:-:1 @P2 FADD part4C2, part4C2, part5C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part5C3; + +10:-:-:-:1 @P0 FADD part6C0, part6C0, part7C0; +--:-:-:-:1 @P1 FADD 
part6C1, part6C1, part7C1; +--:-:-:-:1 @P2 FADD part6C2, part6C2, part7C2; +--:-:-:-:1 @P3 FADD part6C3, part6C3, part7C3; + +--:-:-:-:1 @P0 FADD part0C0, part0C0, part2C0; +--:-:-:-:1 @P1 FADD part0C1, part0C1, part2C1; +--:-:-:-:1 @P2 FADD part0C2, part0C2, part2C2; +--:-:-:-:1 @P3 FADD part0C3, part0C3, part2C3; + +--:-:-:-:1 @P0 FADD part4C0, part4C0, part6C0; +--:-:-:-:1 @P1 FADD part4C1, part4C1, part6C1; +--:-:-:-:1 @P2 FADD part4C2, part4C2, part6C2; +--:-:-:-:1 @P3 FADD part4C3, part4C3, part6C3; + +--:-:-:-:1 @P0 FADD c0, part0C0, part4C0; +--:-:-:-:1 @P1 FADD c1, part0C1, part4C1; +--:-:-:-:1 @P2 FADD c2, part0C2, part4C2; +--:-:-:-:1 @P3 FADD c3, part0C3, part4C3; + + +--:-:-:-:0 IADD cy, cy, 16; + +[+ + our $vec; + return $vec ? q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0.H0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC0.H1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC1.H0; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC1.H1; + } : q{ +01:-:1:-:1 @P5 F2F.F32.F16 b0, loadC0; +--:-:2:-:1 @P5 F2F.F32.F16 b1, loadC1; +--:-:3:-:1 @P5 F2F.F32.F16 b2, loadC2; +--:-:4:-:1 @P5 F2F.F32.F16 b3, loadC3; + }; ++] + +01:-:-:-:1 @P5 FFMA c0, b0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, b1, beta, c1; +04:-:-:-:1 @P5 FFMA c2, b2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, b3, beta, c3; + +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:0 ISETP.LT.AND P5, PT, cy, param_m, P5; + +--:-:1:-:1 @P0 F2F.F16.F32 c0, c0; +--:-:2:-:1 @P1 F2F.F16.F32 c1, c1; + +--:-:-:-:0 ISETP.LT.AND P4, PT, cy, param_m, PT; + +--:-:3:-:1 @P2 F2F.F16.F32 c2, c2; +--:-:4:-:1 @P3 F2F.F16.F32 c3, c3; + +[+ + our $vec; + return $vec ? 
q{ +03:-:-:-:2 @P0 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 @P0 BFI c1, c3, 0x1010, c2; + +--:1:-:-:1 @P0 STG.E.CG.64 [C], c; + } : q{ +01:-:-:-:1 @P0 STG.E.U16 [C + 2x<0>], c0; +02:-:-:-:1 @P1 STG.E.U16 [C + 2x<1>], c1; +04:-:-:-:1 @P2 STG.E.U16 [C + 2x<2>], c2; +08:1:-:-:1 @P3 STG.E.U16 [C + 2x<3>], c3; + }; ++] + +// Restore beta preds +--:-:-:-:1 @P5 R2P PR, preds, 0x0f; +--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f; + +01:-:-:-:6 IADD C0.CC, C0, ldc16; +--:-:-:-:0 IADD.X C1, C1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_tn_128x128.sass b/Kernel/SGEMM/Pascal/hgemm_tn_128x128.sass new file mode 100644 index 0000000..c2beee1 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_tn_128x128.sass @@ -0,0 +1,360 @@ +# Kernel: hgemm_tn_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +[- + +our $int16; + +our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16'; + +sub convert_in {return $convert;} + + +sub int16_params { + return $int16 ? 
q{ +param_Stats[0] : c[0x0][0x190] +param_Stats[1] : c[0x0][0x194] +param_scale : c[0x0][0x198] + } : ""; +} +-] + + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + [+ int16_params() +] + + + + + 64-95 ~ lda, ldb, ldaz, ldbz, tid1, tid7, tid31, tid128, tid15, tidX, blk, x<1-3>, y<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-103 : loadA<0-3>, loadB<0-3> + + 104-107 : trackA<0-1>, trackB<0-1> + + 108-118 ~ writeS, k, txa, txb, tidY, ta, tb, loop + 119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ + + 64-75 ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128 + + 64-79 : c<0-7>, d3, d2, d1, d0, cs<0-3> + 64-65 : Stats<0-1> + 80-89 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 90-118 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV loop, RZ; + +--:-:-:-:1 STS.128 [addr_zero], RZ; +[+ + join('', map 
sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); ++] + +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND tid128, tid, 128; + +// tidX = (tid & 31) << 2 +// tidY = (tid >> 5) & 7 +01:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL tidX, tid31, 2; +--:-:-:-:1 BFE.U32 tidY, tid, 0x305; // 3 bits at position 5 + +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 4; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// trackA += (blkA*128 + lda*tidY + tidX) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// trackB += (blkB*128 + ldb*tidY + tidX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidX, 7; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeS, writeS, 4x<128*8*2>, 2; + + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + + + +[+ + our $vec; + return $vec ? 
q{ + +// doLoad = tidY < k && txa|txb < m|n +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P6; + +--:-:2:-:1 @P2 LDG.E.CI.64 loadA, [trackA]; +--:-:3:-:1 @P3 LDG.E.CI.64 loadB, [trackB]; + +--:-:5:-:1 @!P2 LDS.U.64 loadA, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 loadB, [addr_zero]; + + // Vec 4 and scalar loads + } : q{ + +// doLoadA = tidY < k && txa < m +// doLoadB = tidY < k && txb < n +--:-:-:-:1 IADD x1, txa, 1; +--:-:-:-:1 IADD x2, txa, 2; +--:-:-:-:1 IADD x3, txa, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_m, P0; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<00 + 0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<00 + 1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<00 + 2>]; +--:-:2:-:1 @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 IADD y1, txb, 1; +--:-:-:-:1 IADD y2, txb, 2; +--:-:-:-:1 IADD y3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_n, P0; + +--:-:3:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>]; +--:-:3:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>]; +--:-:3:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>]; +--:-:3:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + }; ++] + + + +[+ + our $vec; + our $convert; + return $vec ?
qq{ +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 7; + +12:-:-:-:4 $convert loadA3, loadA1.H1; +--:-:-:-:0 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:4 $convert loadA2, loadA1.H0; +--:-:-:-:4 $convert loadA1, loadA0.H1; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; +--:-:2:-:2 $convert loadA0, loadA0.H0; + +02:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadA; + +24:-:-:-:4 $convert loadB3, loadB1.H1; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 $convert loadB2, loadB1.H0; +--:-:-:-:4 $convert loadB1, loadB0.H1; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; +--:-:3:-:2 $convert loadB0, loadB0.H0; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; + +04:-:-:-:1 STS.128 [writeS + 4x<8*128>], loadB; + + // scalar loads + } : qq{ +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +02:-:-:-:4 $convert loadA0, loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:4 $convert loadA1, loadA1; +--:-:-:-:4 $convert loadA2, loadA2; +--:-:2:-:2 $convert loadA3, loadA3; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadA0; + +04:-:-:-:4 $convert loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 $convert loadB1, loadB1; +--:-:-:-:4 $convert loadB2, loadB2; +--:-:3:-:2 $convert loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +04:-:-:-:1 STS.128 [writeS + 4x<8*128>], loadB0; + + }; ++] + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<128*8*2>; + + +[+ + our $vec; + our $convert; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n"); + + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + ($vec ? 
+ ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI.64 loadA0, [trackA];\n", + j0c13 => "--:-:3:-:1 \@P3 LDG.E.CI.64 loadB0, [trackB];\n", + + j5c1 => "02:-:-:-:1 \@P2 $convert loadA3, loadA1.H1;\n", + j5c5 => "--:-:-:-:1 \@P2 $convert loadA2, loadA1;\n", + j5c9 => "--:-:-:-:1 \@P2 $convert loadA1, loadA0.H1;\n", + j5c13 => "--:-:2:-:1 \@P2 $convert loadA0, loadA0;\n", + + j6c1 => "04:-:-:-:1 \@P3 $convert loadB3, loadB1.H1;\n", + j6c5 => "--:-:-:-:1 \@P3 $convert loadB2, loadB1;\n", + j6c9 => "--:-:-:-:1 \@P3 $convert loadB1, loadB0.H1;\n", + j6c13 => "--:-:3:-:1 \@P3 $convert loadB0, loadB0;\n", + ) : + ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n", + j0c12 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n", + j0c14 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n", + j0c16 => "--:-:2:-:1 \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n", + + j0c29 => "--:-:3:-:1 \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j0c31 => "--:-:3:-:1 \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j0c33 => "--:-:3:-:1 \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j0c35 => "--:-:3:-:1 \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j5c1 => "02:-:-:-:1 \@P2 $convert loadA0, loadA0;\n", + j5c5 => "--:-:-:-:1 \@P2 $convert loadA1, loadA1;\n", + j5c9 => "--:-:-:-:1 \@P2 $convert loadA2, loadA2;\n", + j5c13 => "--:-:2:-:1 \@P2 $convert loadA3, loadA3;\n", + + j6c1 => "04:-:-:-:1 \@P3 $convert loadB0, loadB0;\n", + j6c5 => "--:-:-:-:1 \@P3 $convert loadB1, loadB1;\n", + j6c9 => "--:-:-:-:1 \@P3 $convert loadB2, loadB2;\n", + j6c13 => "--:-:3:-:1 \@P3 $convert loadB3, loadB3;\n", + ) + ), + + j5c31 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*128>], loadA;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, param_lda8;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c31 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<8*128>], loadB;\n", + + j6c46 => "--:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c54 => 
"--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" . + "--:-:-:-:1 IADD32I k, k, -8;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; ++] + + diff --git a/Kernel/SGEMM/Pascal/hgemm_tn_128x16.sass b/Kernel/SGEMM/Pascal/hgemm_tn_128x16.sass new file mode 100644 index 0000000..5cd8cce --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_tn_128x16.sass @@ -0,0 +1,554 @@ +# Kernel: hgemm_tn_128x16 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*2 + 16*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 16-17 : Rand<0-1> + + 18-47 ~ lda, ldb, ldaz, ldbz, lda8, ldb8, ta, tb, tid1, tid96, tidAX, tidBX, tidY, txa, txb, dimA, flag + + 0-15 : czero<00-15> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + + 16-23 : j0Ay<0-3>, j0Bx<0-3> + 24-31 : j1Ay<0-3>, j1Bx<0-3> + 32-39 : j2Ay<0-3>, j2Bx<0-3> + 40-47 : j3Ay<0-3>, j3Bx<0-3> + + 48-55 : load0A<0-7> + 56-63 : load1A<0-7> + 64-71 : load2A<0-7> + 72-79 : load3A<0-7> + + 80-83 : load<0-3>B + + 84-87 : track0A<0-1>, track0B<0-1> + 88-91 : track1A<0-1>, track1B<0-1> + 92-95 : track2A<0-1>, track2B<0-1> + 96-99 : track3A<0-1>, track3B<0-1> + + 100-104 ~ writeAs, writeBs, k, lda32, ldb32 + 105-112 ~ readAs, readBs, tid, blkA, blkB, blkZ, tbid, seed + + 16-25 : c<0-3>, b<0-1>, d3, d2, d1, d0 + 26-27 : Cy<0-1> + 28-104 ~ ldc, ldcz, ldc1, writeCs, readCs, tidCX, tidCY, cx, cy, ci, xmad_c, alpha, beta, flags, tid31, lfsr<0-2>, exp<0-3>, rand<0-3>, lfsr<0-2>_1, lfsr<0-2>_2, clk_shf1, clk_shf2 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 STS.128 [addr_zero], RZ; + +--:-:-:-:1 LDS.U.128 czero00, [addr_zero]; +--:-:-:-:1 LDS.U.128 czero04, [addr_zero]; +--:-:-:-:1 LDS.U.128 czero08, [addr_zero]; 
+--:-:-:-:1 LDS.U.128 czero12, [addr_zero]; + +// Grab a seed for this thread +// (blkB*gridDimA*256 + blkA*256 + tid) & (2048*32 - 1) +--:-:-:-:1 MOV flag, param_flags; +--:-:-:-:1 LOP.AND.NZ P4, RZ, flag, 0x1; +--:-:-:-:1 MOV dimA, gridDimA; +03:-:-:-:1 ISCADD tbid, blkA, tid, 8; +04:-:-:-:1 XMAD.U16.U16 dimA, blkB, dimA, RZ; +--:-:-:-:1 ISCADD tbid, dimA, tbid, 8; +--:-:-:-:1 LOP.AND seed, tbid, 1x<2048*32 - 1>; +--:-:-:-:1 LEA Rand0.CC, seed, param_Rand[0], 0x2; +--:-:-:-:1 LEA.HI.X Rand1, seed, param_Rand[1], RZ, 0x2; +--:-:-:-:1 @P4 LDG.E.CS seed, [Rand]; + +// tidBX = tid & 15 +// tidAX = (tid & 15) << 3 +// tidY = (tid >> 4) & 7 +01:-:-:-:1 LOP.AND tidBX, tid, 15; +--:-:-:-:1 SHL tidAX, tidBX, 3; +--:-:-:-:1 BFE.U32 tidY, tid, 0x304; // 3 bits at position 4 + +--:-:-:-:1 MOV lda8, param_lda8; +--:-:-:-:1 MOV ldb8, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda8, 4; +--:-:-:-:1 SHR.U32 ldb, ldb8, 4; +--:-:-:-:1 SHL lda32, lda8, 2; +--:-:-:-:1 SHL ldb32, ldb8, 2; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + + +// trackA += (blkA*128 + lda*tidY + tidAX) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// trackB += (blkB*16 + ldb*tidY + tidBX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 4; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA track0B0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X track0B1, tb, param_B[1], RZ, 0x1; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeAs = (128*tidY + tidAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidAX, 7; +--:-:-:-:1 SHL writeAs, writeAs, 2; + +// writeBs = (16*tidY + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidBX, 4; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<128*8>, 2; + +// Start the read buffers low +//
readAs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4 +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readAs, readAs, tid96; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid & 0x10) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readBs, tid, 0x10; +--:-:-:-:1 SHR.U32 readBs, readBs, 3; +--:-:-:-:1 LOP.OR readBs, readBs, tid1; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + +--:-:-:-:1 IADD track1A0.CC, track0A0, lda8; +--:-:-:-:1 IADD.X track1A1, track0A1, RZ; +--:-:-:-:1 IADD track1B0.CC, track0B0, ldb8; +--:-:-:-:1 IADD.X track1B1, track0B1, RZ; + +--:-:-:-:1 IADD track2A0.CC, track1A0, lda8; +--:-:-:-:1 IADD.X track2A1, track1A1, RZ; +--:-:-:-:1 IADD track2B0.CC, track1B0, ldb8; +--:-:-:-:1 IADD.X track2B1, track1B1, RZ; + +--:-:-:-:1 IADD track3A0.CC, track2A0, lda8; +--:-:-:-:1 IADD.X track3A1, track2A1, RZ; +--:-:-:-:1 IADD track3B0.CC, track2B0, ldb8; +--:-:-:-:1 IADD.X track3B1, track2B1, RZ; + + +--:-:3:-:1 @P5 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P6 LDG.E.CI.S16 load0B, [track0B]; + +--:-:4:-:1 @P5 LDG.E.CI.128 load1A, [track1A]; +--:-:4:-:1 @P6 LDG.E.CI.S16 load1B, [track1B]; + +--:-:5:-:1 @P5 LDG.E.CI.128 load2A, [track2A]; +--:-:5:-:1 @P6 LDG.E.CI.S16 load2B, [track2B]; + +--:-:6:-:1 @P5 LDG.E.CI.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.S16 load3B, [track3B]; + + +--:-:-:-:1 ISETP.GE.AND P0, PT, k, 32, PT; +--:-:-:-:1 ISETP.GT.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GT.AND P4, PT, k, 32, P6; +--:-:-:-:1 IADD k, k, -32; + + +04:-:-:-:4 F2F.F32.F16 load0A7, load0A3.H1; +--:-:-:-:4 F2F.F32.F16 load0A6, load0A3.H0; +--:-:-:-:0 IADD track0A0.CC, track0A0, lda32; +--:-:-:-:4 F2F.F32.F16 load0A5, load0A2.H1; +--:-:1:-:4 F2F.F32.F16 load0A4, load0A2.H0; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; +--:-:-:-:4 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:4 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:0 
IADD track0B0.CC, track0B0, ldb32; +--:-:-:-:4 F2F.F32.F16 load0A1, load0A0.H1; +--:-:2:-:4 F2F.F32.F16 load0A0, load0A0.H0; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; +--:-:3:-:1 F2F.F32.F16 load0B, load0B; + +01:-:-:-:1 STS.128 [writeAs + 4x<0*(128*8 + 16*8) + 4>], load0A4; +02:-:-:-:1 STS.128 [writeAs + 4x<0*(128*8 + 16*8) + 0>], load0A0; +04:-:-:-:1 STS [writeBs + 4x<0*(128*8 + 16*8) + 0>], load0B; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 0*(128*8 + 16*8)>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*16 + 0*(128*8 + 16*8)>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 0*(128*8 + 16*8)>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*16 + 0*(128*8 + 16*8)>]; +--:-:3:-:1 @P3 LDG.E.CI.128 load0A, [track0A]; +--:-:3:-:1 @P4 LDG.E.CI.S16 load0B, [track0B]; + +LOOP: + + + + our @top; + our %insert; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + + foreach my $k (0 .. 
3) + { + my $shareBuf = ($k + 1) & 1; + my $store = ($k + 1) & 3; + my $loadBar = $store + 3; + my $storBar = sprintf '%02x', 1 << ($store + 2); + + %insert = + ( + j0c11 => "$storBar:-:-:-:1 \@P0 F2F.F32.F16 load${store}A7, load${store}A3.H1;\n", + j0c15 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A6, load${store}A3.H0;\n", + j1c3 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A5, load${store}A2.H1;\n", + j1c7 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A4, load${store}A2.H0;\n", + j1c11 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A3, load${store}A1.H1;\n", + j1c15 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A2, load${store}A1.H0;\n", + j2c3 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A1, load${store}A0.H1;\n", + j2c7 => "--:-:-:-:1 \@P0 F2F.F32.F16 load${store}A0, load${store}A0.H0;\n", + j2c11 => "--:-:$loadBar:-:1 \@P0 F2F.F32.F16 load${store}B, load${store}B;\n", + + j2c12 => "--:-:-:-:1 \@P0 IADD track${store}A0.CC, track${store}A0, lda32;\n", + j3c1 => "--:-:-:-:1 \@P0 IADD.X track${store}A1, track${store}A1, RZ;\n", + j3c3 => "--:-:-:-:1 \@P0 IADD track${store}B0.CC, track${store}B0, ldb32;\n", + j3c8 => "--:-:-:-:1 \@P0 IADD.X track${store}B1, track${store}B1, RZ;\n", + + j3c9 => "$storBar:-:-:-:1 \@P0 STS.128 [writeAs + 4x<$shareBuf*(128*8 + 16*8) + 0>], load${store}A0;\n", + j4c4 => "--:-:-:-:1 \@P0 STS.128 [writeAs + 4x<$shareBuf*(128*8 + 16*8) + 4>], load${store}A4;\n", + j4c6 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<$shareBuf*(128*8 + 16*8) + 0>], load${store}B;\n", + + j5c15 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n", + + j6c1 => "--:-:$loadBar:-:1 \@P3 LDG.E.CI.128 load${store}A, [track${store}A];\n", + j6c3 => "--:-:$loadBar:-:1 \@P4 LDG.E.CI.S16 load${store}B, [track${store}B];\n", + + ($k == 3 ? 
+ ( + j0c4 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 32, PT;\n", + j0c6 => "--:-:-:-:1 ISETP.GT.AND P3, PT, k, 32, P5;\n", + j0c8 => "--:-:-:-:1 ISETP.GT.AND P4, PT, k, 32, P6;\n", + j0c10 => "--:-:-:-:1 IADD k, k, -32;\n", + + j7c15 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : () + ), + ); + + foreach my $j (0 .. 7) + { + my $rsPred = $j >= 6 && $k == 3 ? '@P0' : ' '; + my $barrier = $j & 1 ? 2 : 1; + my $loadReg = ($j + 2) & 3; + my $compute = $j & 3; + my $shareLine = ($j + 2) & 7; + $shareBuf = $j >= 6 ? ($k + 1) & 1 : $k & 1; + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + %d*(128*8 + 16*8)>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shareBuf; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*16 + %d*(128*8 + 16*8)>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shareBuf; + + foreach my $c (0 .. 15) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + # The FFMA stalls 0 cycles when the first line of the inserted text is one of these ops, else 1. + # Match that first line in place: an empty insert then simply fails to match instead of matching against undef. + my $stall = $ins =~ /\A[^\n]*(?:LDS|F2F|I2I|LDG|STS|BAR|BRA)/ ? 0 : 1; + + my $yield = $c == 8 && $stall ? 
'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + $out .= "\n"; + } + return $out; + + + +// + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 16 + readBs; +--:-:-:-:1 LOP.AND readAs, readAs, 0x1ff; +--:-:-:-:1 LOP.AND readBs, readBs, 0x1ff; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 2; + +// tidCX = (tid & 3) << 2 +// tidCY = tid >> 2 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tidCX, tid, 3; +--:-:-:-:1 SHL tidCX, tidCX, 2; +--:-:-:-:1 SHR.U32 tidCY, tid, 2; + +// readCs = (tidCY*16 + tidCX) << 2; +--:-:-:-:1 ISCADD readCs, tidCY, tidCX, 4; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*16 + tidCX; +--:-:-:-:1 ISCADD cx, blkB, tidCX, 4; + +// cy = blkA*128 + tidCY*4 +--:-:-:-:1 SHL cy, tidCY, 2; +--:-:-:-:1 ISCADD cy, blkA, cy, 7; + +// C += (cy*ldc + cx) * 2; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, cy, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA Cy0.CC, ci, param_C[0], 1; +--:-:-:-:0 LEA.HI.X Cy1, ci, param_C[1], RZ, 1; + +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; + +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + +// Random Round flag +--:-:-:-:2 LOP.AND.NZ P4, RZ, flags, 1; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P3, RZ, flags, 2; + +--:-:-:-:1 SHL ldc1, ldc, 1; + +// Seed the Tausworthe +--:-:-:-:1 LOP.XOR lfsr0, seed, tbid; +--:-:-:-:1 CS2R lfsr1, SR_CLOCKLO; +--:-:-:-:1 CS2R lfsr2, SR_GLOBALTIMERLO; +--:-:-:-:1 LOP.AND clk_shf1, lfsr1, 31; +--:-:-:-:1 LOP.AND clk_shf2, lfsr2, 31; +--:-:-:-:1 LOP.XOR clk_shf1, clk_shf1, tid31; +--:-:-:-:1 LOP.XOR clk_shf2, clk_shf2, tid31; +--:-:-:-:1 SHF.R.U64 lfsr1, lfsr1, clk_shf1, lfsr1; +--:-:-:-:1 SHF.R.U64 lfsr2, lfsr2, clk_shf2, lfsr2; +--:-:-:-:1 LOP.AND tbid, 
tbid, 1x<2048*32 - 1>; + + +--:-:-:-:5 BAR.SYNC 0; + + + + # One group per output row 0..3: scale that row's four cx<0-3>y<row> values by alpha into c0-c3, then call STORE_C. + return join '', map + { + my $row = $_; + sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($row) x 4) . + "--:-:-:-:5 CAL STORE_C;\n\n"; + } 0 .. 3; + + + +--:-:-:-:6 LEA Rand0.CC, tbid, param_Rand[0], 0x2; +--:-:-:-:1 LEA.HI.X Rand1, tbid, param_Rand[1], RZ, 0x2; +--:-:-:-:2 LOP3.LUT seed, lfsr0, lfsr1, lfsr2, 0x96; +--:-:-:-:1 @P4 STG.E.CS [Rand], seed; + +--:-:-:-:5 EXIT; + + +STORE_C: + +--:-:-:-:2 ISETP.LT.AND P1, PT, cy, param_m, P5; +--:-:-:Y:b ISETP.LT.AND P0, PT, cy, param_m, P6; +--:-:-:-:0 IADD cy, cy, 1; + +--:-:1:-:1 @P1 LDG.E.64 b0, [Cy]; + +// Apply relu +--:-:-:-:1 @P3 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P3 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P3 FMNMX c2, c2, RZ, !PT; +--:-:-:-:4 @P3 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:5:-:1 LDS.U.128 c0, [readCs]; + +01:-:1:-:4 @P1 F2F.F32.F16 d3, b1.H1; +--:-:2:-:4 @P1 F2F.F32.F16 d2, b1.H0; +--:-:3:-:4 @P1 F2F.F32.F16 d1, b0.H1; +--:-:4:-:1 @P1 F2F.F32.F16 d0, b0.H0; + +11:-:-:-:1 @P1 FFMA c3, d3, beta, c3; +02:-:-:-:1 @P1 FFMA c2, d2, beta, c2; +04:-:-:-:1 @P1 FFMA c1, d1, beta, c1; +08:-:-:-:0 @P1 FFMA c0, d0, beta, c0; + +--:-:-:-:5 @P4 BRA.U DO_RANDOM1; + +--:-:1:-:4 F2F.F16.F32 c0, c0; +--:-:2:-:4 F2F.F16.F32 c1, c1; +--:-:3:-:4 F2F.F16.F32 c2, c2; +--:-:4:-:1 F2F.F16.F32 c3, c3; + +--:-:-:-:5 BRA.U END_ROUND1; + +DO_RANDOM1: + +--:-:-:-:5 CAL RANDOM_ROUND; + +END_ROUND1: + +// Pack 2 16 bit values into 32 bit words +03:-:-:-:2 BFI c0, c1, 0x1010, c0; +0c:-:-:-:2 BFI c1, c3, 0x1010, c2; + +--:1:-:-:2 @P0 STG.E.64 [Cy], c0; + +01:-:-:-:6 IADD Cy0.CC, Cy0, ldc1; +--:-:-:-:0 IADD.X Cy1, Cy1, RZ; + +--:-:-:-:5 RET; + +RANDOM_ROUND: + + + +// Strip mantissa and leave sign+exponent +--:-:-:-:1 LOP32I.AND exp0, c0, 0xff800000; +--:-:-:-:1 LOP32I.AND exp1, c1, 0xff800000; 
+--:-:-:-:1 LOP32I.AND exp2, c2, 0xff800000; +--:-:-:-:1 LOP32I.AND exp3, c3, 0xff800000; + +// Find the exponent that will shift 32 bits of integer data +// out past the lsb of this number as an fp16 +// exp *= 2^-10 * 2^-32 (2^-42) +--:-:-:-:1 FMUL32I exp0, exp0, 0x2a800000; +--:-:-:-:1 FMUL32I exp1, exp1, 0x2a800000; +--:-:-:-:1 FMUL32I exp2, exp2, 0x2a800000; +--:-:-:-:1 FMUL32I exp3, exp3, 0x2a800000; + +// lfsr0 = ((lfsr0 & 0xfffffffe) << 12) ^ (((lfsr0 << 13) ^ lfsr0) >> 19); +--:-:-:-:1 LOP32I.AND lfsr0_1, lfsr0, 0xfffffffe; +--:-:-:-:1 SHL lfsr0_1, lfsr0_1, 12; +--:-:-:-:1 SHL lfsr0_2, lfsr0, 13; +--:-:-:-:1 LOP.XOR lfsr0_2, lfsr0_2, lfsr0; +--:-:-:-:1 SHR.U32 lfsr0_2, lfsr0_2, 19; +--:-:-:-:1 LOP.XOR lfsr0, lfsr0_1, lfsr0_2; + +// lfsr1 = ((lfsr1 & 0xfffffff8) << 4) ^ (((lfsr1 << 2) ^ lfsr1) >> 25); +--:-:-:-:1 LOP32I.AND lfsr1_1, lfsr1, 0xfffffff8; +--:-:-:-:1 SHL lfsr1_1, lfsr1_1, 4; +--:-:-:-:1 SHL lfsr1_2, lfsr1, 2; +--:-:-:-:1 LOP.XOR lfsr1_2, lfsr1_2, lfsr1; +--:-:-:-:1 SHR.U32 lfsr1_2, lfsr1_2, 25; +--:-:-:-:1 LOP.XOR lfsr1, lfsr1_1, lfsr1_2; + +// lfsr2 = ((lfsr2 & 0xfffffff0) << 11) ^ (((lfsr2 << 3) ^ lfsr2) >> 11); +--:-:-:-:1 LOP32I.AND lfsr2_1, lfsr2, 0xfffffff0; +--:-:-:-:1 SHL lfsr2_1, lfsr2_1, 11; +--:-:-:-:1 SHL lfsr2_2, lfsr2, 3; +--:-:-:-:1 LOP.XOR lfsr2_2, lfsr2_2, lfsr2; +--:-:-:-:1 SHR.U32 lfsr2_2, lfsr2_2, 11; +--:-:-:-:1 LOP.XOR lfsr2, lfsr2_1, lfsr2_2; + +// rand = lfsr0 ^ lfsr1 ^ lfsr2; +// generate 3 other rotations of this rand +--:-:-:-:1 LOP3.LUT rand0, lfsr0, lfsr1, lfsr2, 0x96; +--:-:-:-:1 SHF.R.U64 rand1, rand0, 8, rand0; +--:-:-:-:1 SHF.R.U64 rand2, rand0, 16, rand0; +--:-:-:-:0 SHF.R.U64 rand3, rand0, 24, rand0; +//--:-:-:-:1 MOV32I rand0, 0x80000000; +//--:-:-:-:1 MOV32I rand1, 0x80000000; +//--:-:-:-:1 MOV32I rand2, 0x80000000; +//--:-:-:-:1 MOV32I rand3, 0x80000000; + + +// Convert rand to float +--:-:1:-:4 I2F.F32.U32.RZ rand0, rand0; +--:-:2:-:4 I2F.F32.U32.RZ rand1, rand1; +--:-:3:-:4 I2F.F32.U32.RZ rand2, rand2; 
+--:-:4:-:1 I2F.F32.U32.RZ rand3, rand3; + +// Scale the random number so msb is one below lsb of fp16 +// Add scaled random to number to round +01:-:-:-:1 FFMA.RZ c0, rand0, exp0, c0; +02:-:-:-:1 FFMA.RZ c1, rand1, exp1, c1; +04:-:-:-:1 FFMA.RZ c2, rand2, exp2, c2; +08:-:-:-:0 FFMA.RZ c3, rand3, exp3, c3; + +// Truncate number to fp16 +--:-:1:-:4 F2F.F16.F32.RZ c0, c0; +--:-:2:-:4 F2F.F16.F32.RZ c1, c1; +--:-:3:-:4 F2F.F16.F32.RZ c2, c2; +--:-:4:-:1 F2F.F16.F32.RZ c3, c3; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/hgemm_tn_128x32.sass b/Kernel/SGEMM/Pascal/hgemm_tn_128x32.sass new file mode 100644 index 0000000..239d5d3 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_tn_128x32.sass @@ -0,0 +1,553 @@ +# Kernel: hgemm_tn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*16*2 + 32*16*2> + szShareA : 128*16 + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 4; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; 
+--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL lda16, lda, 5; +--:-:-:-:1 SHL ldb16, ldb, 5; +--:-:-:-:1 SHL lda4, lda, 2; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = (tid & 31) << 2 +// tidAY = (tid >> 5) +01:-:-:-:1 LOP.AND tidAX, tid, 31; +--:-:-:-:1 SHL tidAX, tidAX, 2; +--:-:-:-:1 SHR.U32 tidAY, tid, 5; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += (blkA*128 + tidAX + lda*tidAY) * 2 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO2 ta0, lda, tidAY, txa; +08:-:-:-:1 XMAD.LO2 ta0, ldaz, blkZ, ta0; +--:-:-:-:1 IADD ta1, ta0, lda4; +--:-:-:-:1 IADD ta2, ta1, lda4; +--:-:-:-:1 IADD ta3, ta2, lda4; + +--:-:-:-:1 LEA track0A0.CC, ta0, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track0A1, ta0, param_A[1], RZ, 1; +--:-:-:-:1 LEA track1A0.CC, ta1, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track1A1, ta1, param_A[1], RZ, 1; +--:-:-:-:1 LEA track2A0.CC, ta2, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track2A1, ta2, param_A[1], RZ, 1; +--:-:-:-:1 LEA track3A0.CC, ta3, param_A[0], 1; +--:-:-:-:1 LEA.HI.X track3A1, ta3, param_A[1], RZ, 1; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 1; + +// writeAs = (tidAY*128 + tidAX) * 4 + 4x<szShareA + szShareB> +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB> +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, 
readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<szShareA>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x<szShareA>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<szShareA + szShareB>; + + +REMAINDER: + + + +--:-:-:-:1 IADD tidAY1, tidAY, 4; +--:-:-:-:1 IADD tidAY2, tidAY, 8; +--:-:-:-:1 IADD tidAY3, tidAY, 12; + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.CI.64 load0A, [track0A]; +--:-:2:-:1 @P1 LDG.E.CI.64 load1A, [track1A]; +--:-:3:-:1 @P2 LDG.E.CI.64 load2A, [track2A]; +--:-:4:-:1 @P3 LDG.E.CI.64 load3A, [track3A]; +--:-:5:-:1 @P4 LDG.E.CI.64 loadB, [trackB]; + + + +--:-:6:-:1 @!P0 LDS.U.64 load0A, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.64 load1A, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.64 load2A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.64 load3A, [addr_zero]; +--:-:6:-:2 @!P4 LDS.U.64 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD txa1, txa, 1; +--:-:-:-:1 IADD txa2, txa, 2; +--:-:-:-:1 IADD txa3, txa, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P4; + +--:-:1:-:1 @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:1:-:1 @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:1:-:1 @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:1:-:1 @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY1, k, 
PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:2:-:1 @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:2:-:1 @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:2:-:1 @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:2:-:1 @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P6; + +--:-:3:-:1 @P0 LDG.E.CI.U16 load2A0, [track2A + 2x<0>]; +--:-:3:-:1 @P1 LDG.E.CI.U16 load2A1, [track2A + 2x<1>]; +--:-:3:-:1 @P2 LDG.E.CI.U16 load2A2, [track2A + 2x<2>]; +--:-:3:-:1 @P3 LDG.E.CI.U16 load2A3, [track2A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:4:-:1 @P0 LDG.E.CI.U16 load3A0, [track3A + 2x<0>]; +--:-:4:-:1 @P1 LDG.E.CI.U16 load3A1, [track3A + 2x<1>]; +--:-:4:-:1 @P2 LDG.E.CI.U16 load3A2, [track3A + 2x<2>]; +--:-:4:-:1 @P3 LDG.E.CI.U16 load3A3, [track3A + 2x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND 
P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:5:-:1 @P0 LDG.E.CI.U16 loadB0, [trackB + 2x<0>]; +--:-:5:-:1 @P1 LDG.E.CI.U16 loadB1, [trackB + 2x<1>]; +--:-:5:-:1 @P2 LDG.E.CI.U16 loadB2, [trackB + 2x<2>]; +--:-:5:-:1 @P3 LDG.E.CI.U16 loadB3, [trackB + 2x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 15; + + + + + our $vec; + return $vec ? q{ +21:-:-:-:1 F2F.F32.F16 load0A3, load0A1.H1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A1.H0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A0.H1; +--:-:1:-:1 F2F.F32.F16 load0A0, load0A0.H0; + +02:-:-:-:1 F2F.F32.F16 load1A3, load1A1.H1; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A1.H0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A0.H1; +--:-:2:-:1 F2F.F32.F16 load1A0, load1A0.H0; + +04:-:-:-:1 F2F.F32.F16 load2A3, load2A1.H1; +--:-:-:-:1 F2F.F32.F16 load2A2, load2A1.H0; +--:-:-:-:1 F2F.F32.F16 load2A1, load2A0.H1; +--:-:3:-:1 F2F.F32.F16 load2A0, load2A0.H0; + +08:-:-:-:1 F2F.F32.F16 load3A3, load3A1.H1; +--:-:-:-:1 F2F.F32.F16 load3A2, load3A1.H0; +--:-:-:-:1 F2F.F32.F16 load3A1, load3A0.H1; +--:-:4:-:1 F2F.F32.F16 load3A0, load3A0.H0; + +10:-:-:-:1 F2F.F32.F16 loadB3, loadB1.H1; +--:-:-:-:1 F2F.F32.F16 loadB2, loadB1.H0; +--:-:-:-:1 F2F.F32.F16 loadB1, loadB0.H1; +--:-:5:-:1 F2F.F32.F16 loadB0, loadB0.H0; + } : q{ +21:-:-:-:1 F2F.F32.F16 load0A0, load0A0; +--:-:-:-:1 F2F.F32.F16 load0A1, load0A1; +--:-:-:-:1 F2F.F32.F16 load0A2, load0A2; +--:-:1:-:1 
F2F.F32.F16 load0A3, load0A3; + +02:-:-:-:1 F2F.F32.F16 load1A0, load1A0; +--:-:-:-:1 F2F.F32.F16 load1A1, load1A1; +--:-:-:-:1 F2F.F32.F16 load1A2, load1A2; +--:-:2:-:1 F2F.F32.F16 load1A3, load1A3; + +04:-:-:-:1 F2F.F32.F16 load2A0, load2A0; +--:-:-:-:1 F2F.F32.F16 load2A1, load2A1; +--:-:-:-:1 F2F.F32.F16 load2A2, load2A2; +--:-:3:-:1 F2F.F32.F16 load2A3, load2A3; + +08:-:-:-:1 F2F.F32.F16 load3A0, load3A0; +--:-:-:-:1 F2F.F32.F16 load3A1, load3A1; +--:-:-:-:1 F2F.F32.F16 load3A2, load3A2; +--:-:4:-:1 F2F.F32.F16 load3A3, load3A3; + +10:-:-:-:1 F2F.F32.F16 loadB0, loadB0; +--:-:-:-:1 F2F.F32.F16 loadB1, loadB1; +--:-:-:-:1 F2F.F32.F16 loadB2, loadB2; +--:-:5:-:1 F2F.F32.F16 loadB3, loadB3; + }; + + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 16, P1; + +01:-:-:-:1 STS.128 [writeAs + 4x<0*128>], load0A; +--:-:-:-:6 IADD track0A0.CC, track0A0, lda16; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS.128 [writeAs + 4x<4*128>], load1A; +--:-:-:-:6 IADD track1A0.CC, track1A0, lda16; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS.128 [writeAs + 4x<8*128>], load2A; +--:-:-:-:6 IADD track2A0.CC, track2A0, lda16; +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS.128 [writeAs + 4x<12*128>], load3A; +--:-:-:-:6 IADD track3A0.CC, track3A0, lda16; +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P5 LDG.E.CI.64 load0A, [track0A]; +--:-:4:-:1 @P5 LDG.E.CI.64 load1A, [track1A]; +--:-:5:-:1 @P5 LDG.E.CI.64 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.CI.64 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.64 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P5 LDG.E.CI.U16 load0A0, [track0A + 2x<0>]; +--:-:3:-:1 @P5 LDG.E.CI.U16 load0A1, [track0A + 2x<1>]; +--:-:3:-:1 @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>]; +--:-:3:-:1 @P5 LDG.E.CI.U16 load0A3, [track0A + 2x<3>]; + +--:-:4:-:1 @P5 LDG.E.CI.U16 load1A0, [track1A + 2x<0>]; +--:-:4:-:1 @P5 LDG.E.CI.U16 load1A1, [track1A + 2x<1>]; +--:-:4:-:1 @P5 LDG.E.CI.U16 load1A2, [track1A + 2x<2>]; +--:-:4:-:1 @P5 LDG.E.CI.U16 load1A3, [track1A + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load2A0, [track2A + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2A1, [track2A + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2A2, [track2A + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load2A3, [track2A + 2x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>]; +--:-:5:-:1 @P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>]; +--:-:6:-:1 @P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>]; + }; + + + + our $vec; + our $shiftAX = 0; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n", + j9c6 => "10:5:-:-:1 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n", + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, lda16;\n", + j3c13 => 
"--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, lda16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2A0.CC, track2A0, lda16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, lda16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.64 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.64 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.64 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.64 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.64 loadB, [trackB];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0A3, load0A1.H1;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A2, load0A1.H0;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A1, load0A0.H1;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0A0, load0A0.H0;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1A3, load1A1.H1;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A2, load1A1.H0;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A1, load1A0.H1;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1A0, load1A0.H0;\n", + + j6c13 => "10:-:-:-:1 \@P5 F2F.F32.F16 load2A3, load2A1.H1;\n", + j6c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2A2, load2A1.H0;\n", + j6c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2A1, load2A0.H1;\n", + j6c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load2A0, load2A0.H0;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A3, load3A1.H1;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A2, load3A1.H0;\n", + j8c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A1, load3A0.H1;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3A0, load3A0.H0;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n", + j10c17 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n", + ) : + ( + + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI.U16 load1A0, [track1A + 
2x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];\n", + + j2c13 => "04:-:-:-:1 \@P2 F2F.F32.F16 load0A0, load0A0;\n", + j2c17 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A1, load0A1;\n", + j2c21 => "--:-:-:-:1 \@P2 F2F.F32.F16 load0A2, load0A2;\n", + j2c25 => "--:-:3:-:1 \@P2 F2F.F32.F16 load0A3, load0A3;\n", + + j4c13 => "08:-:-:-:1 \@P3 F2F.F32.F16 load1A0, load1A0;\n", + j4c17 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A1, load1A1;\n", + j4c21 => "--:-:-:-:1 \@P3 F2F.F32.F16 load1A2, load1A2;\n", + j4c25 => "--:-:4:-:1 \@P3 F2F.F32.F16 load1A3, load1A3;\n", + + j6c13 => "10:-:-:-:1 \@P5 F2F.F32.F16 load2A0, load2A0;\n", + j6c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2A1, load2A1;\n", + j6c21 => "--:-:-:-:1 \@P5 F2F.F32.F16 load2A2, load2A2;\n", + j6c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load2A3, load2A3;\n", + + j8c13 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A0, load3A0;\n", + j8c17 => "--:-:-:-:1 \@P5 F2F.F32.F16 load3A1, load3A1;\n", + j8c21 => 
"--:-:-:-:1 \@P5 F2F.F32.F16 load3A2, load3A2;\n", + j8c25 => "--:-:5:-:1 \@P5 F2F.F32.F16 load3A3, load3A3;\n", + + j10c13 => "20:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB3;\n", + j10c17 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB2, loadB2;\n", + j10c21 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB1;\n", + j10c25 => "--:-:6:-:1 \@P6 F2F.F32.F16 loadB0, loadB0;\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/hgemm_tn_128x64.sass b/Kernel/SGEMM/Pascal/hgemm_tn_128x64.sass new file mode 100644 index 0000000..0404ab5 --- /dev/null +++ b/Kernel/SGEMM/Pascal/hgemm_tn_128x64.sass @@ -0,0 +1,389 @@ +# Kernel: hgemm_tn_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*2 + 64*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ tid, blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, ta, tb, tid1, tid15, tidX, x<1-3|65-67>, y<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-107 : loadA<0-7>, loadB<0-3> + 108-111 : trackA<0-1>, trackB<0-1> + + 112-122 ~ writeAs, writeBs, k, txa00, txa64, txb, tidY, swapBuf + 123-127 : readAs, readBs + + 64-83 ~ ldc, ldcz, ci, xmad_c, threadId, tid31, tid96, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C04y<0-1>, C08y<0-1>, C12y<0-1>, C00y<0-1> + 86-107 ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +// tidX = (tid & 15) << 2 +// tidY = (tid >> 4) & 7 +01:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidX, 
tid15, 2; +--:-:-:-:1 BFE.U32 tidY, tid, 0x304; // 3 bits at position 4 + +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 4; +--:-:-:-:1 SHR.U32 ldb, ldb, 4; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// trackA += (blkA*128 + lda*tidY + tidX) * 2 +02:-:-:-:1 ISCADD txa00, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa00; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x1; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x1; +--:-:-:-:1 IADD txa64, txa00, 64; + +// trackB += (blkB*64 + ldb*tidY + tidX) * 2 +04:-:-:-:1 ISCADD txb, blkB, tidX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x1; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x1; + +// Start the write buffers high +// writeAs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; +// writeBs = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; + +// Start the read buffers low +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<64*8 + 128*8>; + + +REMAINDER: + + + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + + + our $vec; + return $vec ? 
q{ +// doLoad = tidY < k && txa00|txb < n|m +--:-:-:-:1 ISETP.LT.AND P4, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY, k, P6; + + +--:-:2:-:1 @P4 LDG.E.CI.64 loadA0, [trackA + 2x<00>]; +--:-:3:-:1 @P5 LDG.E.CI.64 loadA4, [trackA + 2x<64>]; +--:-:4:-:1 @P6 LDG.E.CI.64 loadB0, [trackB]; + +--:-:5:-:1 @!P4 LDS.U.64 loadA0, [addr_zero]; +--:-:5:-:1 @!P5 LDS.U.64 loadA4, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.64 loadB0, [addr_zero]; + + + } : q{ +// doLoadA = tidY < k && txa00 < m +// doLoadB = tidY < k && txb < n +--:-:-:-:1 IADD x1, txa00, 1; +--:-:-:-:1 IADD x2, txa00, 2; +--:-:-:-:1 IADD x3, txa00, 3; +--:-:-:-:1 IADD x65, txa64, 1; +--:-:-:-:1 IADD x66, txa64, 2; +--:-:-:-:1 IADD x67, txa64, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_m, P0; + +--:-:2:-:1 @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<00 + 0>]; +--:-:2:-:1 @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<00 + 1>]; +--:-:2:-:1 @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<00 + 2>]; +--:-:2:-:1 @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x65, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x66, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x67, param_m, P0; + +--:-:3:-:1 @P0 LDG.E.CI.S16 loadA4, [trackA + 2x<00 + 64>]; +--:-:3:-:1 @P1 LDG.E.CI.S16 loadA5, [trackA + 2x<00 + 65>]; +--:-:3:-:1 @P2 LDG.E.CI.S16 loadA6, [trackA + 2x<00 + 66>]; +--:-:3:-:1 @P3 LDG.E.CI.S16 loadA7, [trackA + 2x<00 + 67>]; + +--:-:-:-:1 @!P0 MOV loadA4, RZ; +--:-:-:-:1 @!P1 MOV loadA5, RZ; +--:-:-:-:1 @!P2 MOV loadA6, RZ; +--:-:-:-:1 @!P3 MOV loadA7, RZ; + +--:-:-:-:1 IADD y1, txb, 1; +--:-:-:-:1 IADD y2, txb, 2; +--:-:-:-:1 IADD y3, txb, 3; 
+--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_n, P0; + +--:-:4:-:1 @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>]; +--:-:4:-:1 @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>]; +--:-:4:-:1 @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>]; +--:-:4:-:1 @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + }; + + + + + + our $vec; + return $vec ? q{ +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 7; + +12:-:-:-:4 F2F.F32.F16 loadA3, loadA1.H1; +--:-:-:-:4 F2F.F32.F16 loadA2, loadA1.H0; +--:-:-:-:4 F2F.F32.F16 loadA1, loadA0.H1; +--:-:2:-:4 F2F.F32.F16 loadA0, loadA0.H0; + +04:-:-:-:4 F2F.F32.F16 loadA7, loadA5.H1; +--:-:-:-:0 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:4 F2F.F32.F16 loadA6, loadA5.H0; +--:-:-:-:4 F2F.F32.F16 loadA5, loadA4.H1; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; +--:-:3:-:1 F2F.F32.F16 loadA4, loadA4.H0; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; + +02:-:-:-:1 STS.128 [writeAs + 4x<00>], loadA0; +04:-:-:-:1 STS.128 [writeAs + 4x<64>], loadA4; + +28:-:-:-:4 F2F.F32.F16 loadB3, loadB1.H1; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 F2F.F32.F16 loadB2, loadB1.H0; +--:-:-:-:4 F2F.F32.F16 loadB1, loadB0.H1; +--:-:2:-:2 F2F.F32.F16 loadB0, loadB0.H0; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +02:-:-:-:1 STS.128 [writeBs], loadB0; + + // scalar loads + } : q{ +// bDoRemainder = k > 8 +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +02:-:-:-:4 F2F.F32.F16 loadA0, loadA0; +--:-:-:-:4 F2F.F32.F16 loadA1, loadA1; +--:-:-:-:4 F2F.F32.F16 loadA2, loadA2; +--:-:2:-:4 F2F.F32.F16 loadA3, loadA3; + +04:-:-:-:4 F2F.F32.F16 loadA4, loadA4; +--:-:-:-:0 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:4 F2F.F32.F16 loadA5, loadA5; +--:-:-:-:4 F2F.F32.F16 
loadA6, loadA6; +--:-:3:-:1 F2F.F32.F16 loadA7, loadA7; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +02:-:-:-:1 STS.128 [writeAs + 4x<00>], loadA0; +04:-:-:-:1 STS.128 [writeAs + 4x<64>], loadA4; + +08:-:-:-:4 F2F.F32.F16 loadB0, loadB0; +--:-:-:-:0 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:4 F2F.F32.F16 loadB1, loadB1; +--:-:-:-:4 F2F.F32.F16 loadB2, loadB2; +--:-:2:-:2 F2F.F32.F16 loadB3, loadB3; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +02:-:-:-:1 STS.128 [writeBs], loadB0; + + }; + + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P4, PT, k, $k_end, P4;\n"); + + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, $k_end, P5;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, $k_end, P6;\n", + j0c5 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c7 => "--:-:-:-:1 IADD32I k, k, -8;\n", + + ($vec ? 
+ ( + j0c8 => "--:-:2:-:1 \@P4 LDG.E.CI.64 loadA0, [trackA + 2x<00>];\n", + j0c11 => "--:-:3:-:1 \@P5 LDG.E.CI.64 loadA4, [trackA + 2x<64>];\n", + j0c14 => "--:-:4:-:1 \@P6 LDG.E.CI.64 loadB0, [trackB];\n", + + j4c3 => "02:-:-:-:1 \@P4 F2F.F32.F16 loadA3, loadA1.H1;\n", + j4c7 => "--:-:-:-:1 \@P4 F2F.F32.F16 loadA2, loadA1.H0;\n", + j4c11 => "--:-:-:-:1 \@P4 F2F.F32.F16 loadA1, loadA0.H1;\n", + j4c15 => "--:-:2:-:1 \@P4 F2F.F32.F16 loadA0, loadA0.H0;\n", + + j5c3 => "04:-:-:-:1 \@P5 F2F.F32.F16 loadA7, loadA5.H1;\n", + j5c7 => "--:-:-:-:1 \@P5 F2F.F32.F16 loadA6, loadA5.H0;\n", + j5c11 => "--:-:-:-:1 \@P5 F2F.F32.F16 loadA5, loadA4.H1;\n", + j5c15 => "--:-:3:-:1 \@P5 F2F.F32.F16 loadA4, loadA4.H0;\n", + + j6c3 => "08:-:-:-:1 \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n", + j6c7 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n", + j6c11 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n", + j6c15 => "--:-:4:-:1 \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n", + ) : + ( + j0c10 => "--:-:2:-:1 \@P4 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n", + j0c12 => "--:-:2:-:1 \@P4 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n", + j0c14 => "--:-:2:-:1 \@P4 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n", + j0c16 => "--:-:2:-:1 \@P4 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n", + + j0c33 => "--:-:3:-:1 \@P5 LDG.E.CI.S16 loadA4, [trackA + 2x<64>];\n", + j0c35 => "--:-:3:-:1 \@P5 LDG.E.CI.S16 loadA5, [trackA + 2x<65>];\n", + j0c37 => "--:-:3:-:1 \@P5 LDG.E.CI.S16 loadA6, [trackA + 2x<66>];\n", + j0c39 => "--:-:3:-:1 \@P5 LDG.E.CI.S16 loadA7, [trackA + 2x<67>];\n", + + j1c10 => "--:-:4:-:1 \@P6 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n", + j1c12 => "--:-:4:-:1 \@P6 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n", + j1c14 => "--:-:4:-:1 \@P6 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n", + j1c16 => "--:-:4:-:1 \@P6 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n", + + j4c3 => "02:-:-:-:1 \@P4 F2F.F32.F16 loadA0, loadA0;\n", + j4c7 => "--:-:-:-:1 \@P4 F2F.F32.F16 loadA1, loadA1;\n", + j4c11 => "--:-:-:-:1 \@P4 F2F.F32.F16 
loadA2, loadA2;\n", + j4c15 => "--:-:2:-:1 \@P4 F2F.F32.F16 loadA3, loadA3;\n", + + j5c3 => "04:-:-:-:1 \@P5 F2F.F32.F16 loadA4, loadA4;\n", + j5c7 => "--:-:-:-:1 \@P5 F2F.F32.F16 loadA5, loadA5;\n", + j5c11 => "--:-:-:-:1 \@P5 F2F.F32.F16 loadA6, loadA6;\n", + j5c15 => "--:-:3:-:1 \@P5 F2F.F32.F16 loadA7, loadA7;\n", + + j6c3 => "08:-:-:-:1 \@P6 F2F.F32.F16 loadB0, loadB0;\n", + j6c7 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB1, loadB1;\n", + j6c11 => "--:-:-:-:1 \@P6 F2F.F32.F16 loadB2, loadB2;\n", + j6c15 => "--:-:4:-:1 \@P6 F2F.F32.F16 loadB3, loadB3;\n", + ) + ), + + j4c31 => "02:-:-:-:1 \@P0 STS.128 [writeAs + 4x<00>], loadA0;\n", + j5c31 => "04:-:-:-:1 \@P0 STS.128 [writeAs + 4x<64>], loadA4;\n", + + j5c46 => "--:-:-:-:1 \@P0 IADD trackA0.CC, trackA0, param_lda8;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X trackA1, trackA1, RZ;\n", + + j6c31 => "08:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c46 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c54 => "--:-:-:-:1 \@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_common_128x128.sass b/Kernel/SGEMM/Pascal/sgemm_common_128x128.sass new file mode 100644 index 0000000..703af8f --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_common_128x128.sass @@ -0,0 +1,309 @@ +# sgemm_common_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>]; + +LOOP: + + + + our @top; + our %insert; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? '@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? '01' : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 
'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +--:-:-:-:1 IADD loop, loop, 1; +--:-:-:-:1 IADD ta, ta, param_ldaz; +--:-:-:-:1 IADD tb, tb, param_ldbz; +--:-:-:-:3 MOV k, param_k; +--:-:-:-:1 ISETP.LT.AND P1, PT, loop, param_loops, PT; +--:-:-:-:6 LEA trackA0.CC, ta, param_A[0], 2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 2; +--:-:-:-:6 LEA trackB0.CC, tb, param_B[0], 2; +--:-:-:-:0 LEA.HI.X trackB1, tb, param_B[1], RZ, 2; +--:-:-:Y:5 @P1 BRA.U REMAINDER; + +--:-:1:-:1 S2R blockA, SR_CTAID.Y; +--:-:2:-:1 S2R blockB, SR_CTAID.Z; +--:-:3:-:1 S2R blockZ, SR_CTAID.X; + + +--:-:-:-:1 LOP.AND tid_31, tid, 31; +--:-:-:-:1 LOP.AND tid_96, tid, 96; +--:-:-:-:1 LOP.AND tid_128, tid, 128; + +// writeCs = (readAs / 4) * 128 + readBs; +--:-:-:-:1 LOP.AND readAs, readAs, 0xfff; +--:-:-:-:1 LOP.AND readBs, readBs, 0xfff; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 5; + +// cx = tid_31 | (tid_128 >> 2); +--:-:-:-:1 SHR.U32 cx00, tid_128, 2; +--:-:-:-:1 LOP.OR cx00, tid_31, cx00; + +// readCs = ((tid_96 << 4) | cx) << 2; +--:-:-:-:1 SHL readCs, tid_96, 4; +--:-:-:-:1 LOP.OR readCs, readCs, cx00; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx += blockB*128; +02:-:-:-:1 ISCADD cx00, blockB, cx00, 7; +--:-:-:-:1 IADD cx64, cx00, 64; + +// cy = blockA*128 + (tid_96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid_96, 1; +01:-:-:-:1 ISCADD cy00, blockA, cy00, 7; + +// C += (ldcz*blockZ + ldc*cy + cx00) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, ldc, cy00, cx00, xmad_c; +04:-:-:-:1 XMAD.LO2 ci, ldcz, blockZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 2; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 2; + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV 
flags, param_flags; + +// Apply beta +--:-:-:-:1 ISETP.NE.AND P6, PT, beta, RZ, PT; + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C00y + 4x<00>]; +--:-:2:-:1 @P1 LDG.E d1, [C00y + 4x<64>]; +--:-:3:-:1 @P2 LDG.E d2, [C04y + 4x<00>]; +--:-:4:-:1 @P3 LDG.E d3, [C04y + 4x<64>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c4, c4, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c5, c5, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c6, c6, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c7, c7, RZ, !PT; + +// beta != 0 +--:-:-:-:7 ISETP.NE.AND P6, PT, beta, RZ, PT; + + +--:-:-:-:1 STS.128 [writeCs+4x<00>], c0; +--:-:-:-:1 STS.128 [writeCs+4x<64>], c4; +--:-:-:-:1 LDS c0, [readCs + 4x<0*128 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<0*128 + 64>]; + +--:-:-:-:1 LDS c2, [readCs + 4x<1*128 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<1*128 + 64>]; + + + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:1 @P6 FFMA c3, 
d3, beta, c3; + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, P6; + +--:-:-:-:1 @P0 STG.E.CG [C00y0 + 4x<00>], c0; +--:5:-:-:1 @P1 STG.E.CG [C00y0 + 4x<64>], c1; +--:-:-:-:1 @P2 STG.E.CG [C04y0 + 4x<00>], c2; +--:6:-:-:1 @P3 STG.E.CG [C04y0 + 4x<64>], c3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C08y0 + 4x<00>]; +--:-:2:-:1 @P1 LDG.E d1, [C08y0 + 4x<64>]; +--:-:3:-:1 @P2 LDG.E d2, [C12y0 + 4x<00>]; +--:-:4:-:1 @P3 LDG.E d3, [C12y0 + 4x<64>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx64, param_n, PT; + +--:-:-:-:2 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:2 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + + +10:-:-:-:4 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +20:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:0 IADD.X C04y1, C04y1, RZ; + +--:-:-:-:1 LDS c0, [readCs + 4x<2*128 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<2*128 + 64>]; +--:-:-:-:1 LDS c2, [readCs + 4x<3*128 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*128 + 64>]; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P6 FFMA c3, d3, beta, c3; + +01:-:-:-:1 @P0 STG.E.CG [C08y0 + 4x<00>], c0; +02:5:-:-:1 @P1 STG.E.CG [C08y0 + 4x<64>], c1; +04:-:-:-:1 @P2 STG.E.CG [C12y0 + 4x<00>], c2; +08:6:-:-:1 @P3 STG.E.CG [C12y0 + 4x<64>], c3; + +10:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +20:-:-:-:6 IADD C12y0.CC, 
C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/sgemm_common_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_common_128x32.sass new file mode 100644 index 0000000..928ad6b --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_common_128x32.sass @@ -0,0 +1,240 @@ +# sgemm_common_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ? 
$shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 16 && $stall ? 'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 32 + readBs; +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 3; + +// readCs = (((tid & 96) << 2) | (tid & 31)) << 2; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 2; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*32 + tid31; +--:-:-:-:1 ISCADD cx, blkB, tid31, 5; + +// cy = blkA*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 ISCADD cy00, blkA, cy00, 7; + +// C += (cy*ldc + cx) * 4; +// C += (ldcz*blkZ + ldc*cy + cx) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, ldc, cy00, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, 
ci, param_C[0], 2; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 2; + +// Apply relu +--:-:-:-:0 LOP.AND.NZ P4, RZ, flags, 2; +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; + + +--:-:-:-:4 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 MOV d0, RZ; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:4 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 MOV d1, RZ; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:3 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 MOV d2, RZ; +--:-:-:-:1 MOV d3, RZ; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C00y]; +--:-:2:-:1 @P1 LDG.E d1, [C04y]; +--:-:3:-:1 @P2 LDG.E d2, [C08y]; +--:-:4:-:1 @P3 LDG.E d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:3 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*32>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*32>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*32>]; + + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P5 FFMA c3, d3, beta, c3; + +--:1:-:-:1 @P0 STG.E.CG [C00y], c0; +--:2:-:-:1 @P1 STG.E.CG [C04y], c1; +--:3:-:-:1 @P2 STG.E.CG [C08y], c2; +--:4:-:-:1 @P3 STG.E.CG [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git 
a/Kernel/SGEMM/Pascal/sgemm_common_128x64.sass b/Kernel/SGEMM/Pascal/sgemm_common_128x64.sass new file mode 100644 index 0000000..ee1705e --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_common_128x64.sass @@ -0,0 +1,290 @@ +# sgemm_common_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>]; +--:-:1:-:1 LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>]; + +LOOP: + + + + our @top; + our %insert; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2,4,6) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 7) + { + my $odd = $j & 1; + my $nOdd = !$odd + 0; + my $rsOffset = ($j + 1) % 8; + my $rsPred = $j == 7 ? 
'@P0' : ' '; + + $insert{"j${j}c0"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c2"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c4"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; + $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; + + foreach my $c (0 .. 63) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? '01' : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 32 && $stall ? 'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $ins; + } + } + return $out; + + + +--:-:1:-:1 S2R tid_2, SR_TID.X; +--:-:2:-:1 S2R blockA, SR_CTAID.Y; +--:-:3:-:1 S2R blockB, SR_CTAID.Z; +--:-:4:-:1 S2R blockZ, SR_CTAID.X; + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 64 + readBs; +--:-:-:-:1 LOP.AND readAs, readAs, 0xff; +--:-:-:-:1 LOP.AND readBs, readBs, 0xff; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 4; + +// readCs = (((tid_2 & 96) << 3) | (tid_2 & 31)) << 2; +01:-:-:-:1 LOP.AND tid31, tid_2, 31; +01:-:-:-:1 LOP.AND tid96, tid_2, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 3; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx00 = blockB*64 + tid31; +04:-:-:-:1 ISCADD cx00, blockB, tid31, 6; +--:-:-:-:1 IADD cx32, cx00, 32; + +// cy = blockA*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +02:-:-:-:1 ISCADD cy00, blockA, cy00, 7; + +// C += (ldcz*blockZ + ldc*cy + cx00) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, ldc, cy00, cx00, xmad_c; +08:-:-:-:1 
XMAD.LO2 ci, ldcz, blockZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 2; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 2; + + +--:-:-:-:1 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0 + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:1 FMUL c3, cx3y%d, alpha;\n" . + "--:-:-:-:1 FMUL c4, cx4y%d, alpha;\n" . + "--:-:-:-:1 FMUL c5, cx5y%d, alpha;\n" . + "--:-:-:-:1 FMUL c6, cx6y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c7, cx7y%d, alpha;\n", + ($y) x 8); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, P6; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C00y0 + 4x<00>]; +--:-:2:-:1 @P1 LDG.E d1, [C00y0 + 4x<32>]; +--:-:3:-:1 @P2 LDG.E d2, [C04y0 + 4x<00>]; +--:-:4:-:1 @P3 LDG.E d3, [C04y0 + 4x<32>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy04, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy04, param_m, P5; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P6, RZ, flags, 2; +--:-:-:-:1 @P6 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c3, c3, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c4, c4, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c5, c5, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c6, c6, RZ, !PT; +--:-:-:-:1 @P6 FMNMX c7, c7, RZ, !PT; + +--:-:-:-:7 ISETP.NE.AND P6, PT, beta, RZ, PT; + + +--:-:-:-:1 STS.128 [writeCs+4x<00>], c0; +--:-:-:-:1 STS.128 [writeCs+4x<32>], c4; + +--:-:-:-:1 LDS c0, [readCs + 4x<0*64 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<0*64 + 32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<1*64 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<1*64 + 32>]; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:1 @P6 FFMA c3, d3, beta, c3; + + +--:-:-:-:1 @P0 STG.E.CS [C00y0 + 4x<00>], c0; +--:5:-:-:1 
@P1 STG.E.CS [C00y0 + 4x<32>], c1; +--:-:-:-:1 @P2 STG.E.CS [C04y0 + 4x<00>], c2; +--:6:-:-:1 @P3 STG.E.CS [C04y0 + 4x<32>], c3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, P6; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C08y0 + 4x<00>]; +--:-:2:-:1 @P1 LDG.E d1, [C08y0 + 4x<32>]; +--:-:3:-:1 @P2 LDG.E d2, [C12y0 + 4x<00>]; +--:-:4:-:1 @P3 LDG.E d3, [C12y0 + 4x<32>]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, cx00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, cx32, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy08, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy12, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + + +10:-:-:-:2 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:1 IADD cy12, cy12, 1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +20:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:0 IADD.X C04y1, C04y1, RZ; + +--:-:-:-:1 LDS c0, [readCs + 4x<2*64 + 00>]; +--:-:5:-:1 LDS c1, [readCs + 4x<2*64 + 32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<3*64 + 00>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*64 + 32>]; + +11:-:-:-:1 @P6 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P6 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P6 FFMA c2, d2, beta, c2; +08:-:-:-:1 @P6 FFMA c3, d3, beta, c3; + +--:-:-:-:1 @P0 STG.E.CS [C08y0 + 4x<00>], c0; +--:5:-:-:1 @P1 STG.E.CS [C08y0 + 4x<32>], c1; +--:-:-:-:1 @P2 STG.E.CS [C12y0 + 4x<00>], c2; +--:6:-:-:1 @P3 STG.E.CS [C12y0 + 4x<32>], c3; + +10:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +20:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; 
+--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/sgemm_common_32x128.sass b/Kernel/SGEMM/Pascal/sgemm_common_32x128.sass new file mode 100644 index 0000000..da4d83d --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_common_32x128.sass @@ -0,0 +1,234 @@ +# Kernel: sgemm_common_32x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*32 + 16 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ?
$shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1; + + my $yield = $c == 16 && $stall ? 'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; + +// writeCs = (readAs / 4) * 128 + readBs; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 5; + +// readCs = tid * 4; +--:-:-:-:1 SHL readCs, tid, 2; + +// cx = blkB*128 + tid; +--:-:-:-:1 ISCADD cx, blkB, tid, 7; + +// cy = blkA*32 +--:-:-:-:1 SHL cy00, blkA, 5; + +// C += (cy*ldc + cx) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; + +--:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, blkZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, param_C[0], 2; +--:-:-:-:1 LEA.HI.X C00y1, ci, param_C[1], RZ, 2; + +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; + +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + +// Apply relu +--:-:-:-:1 LOP.AND.NZ P4, RZ, flags, 
2; + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 ISCADD ldc12, ldc, -ldc4, 6; + + + +--:-:-:-:5 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:5 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:5 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:0 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc12;\n" . + "--:-:-:-:1 IADD cy00, cy00, 12;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc12;\n" . + "--:-:-:-:1 IADD cy04, cy04, 12;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc12;\n" . + "--:-:-:-:1 IADD cy08, cy08, 12;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc12;\n" . + "--:-:-:-:1 IADD cy12, cy12, 12;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . 
+ "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C00y]; +--:-:2:-:1 @P1 LDG.E d1, [C04y]; +--:-:3:-:1 @P2 LDG.E d2, [C08y]; +--:-:4:-:1 @P3 LDG.E d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:3 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*128>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*128>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*128>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*128>]; + + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:0 @P5 FFMA c3, d3, beta, c3; + +--:1:-:-:1 @P0 STG.E.CG [C00y], c0; +--:2:-:-:1 @P1 STG.E.CG [C04y], c1; +--:3:-:-:1 @P2 STG.E.CG [C08y], c2; +--:4:-:-:1 @P3 STG.E.CG [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:0 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:5 RET; diff --git 
a/Kernel/SGEMM/Pascal/sgemm_nn_128x128.sass b/Kernel/SGEMM/Pascal/sgemm_nn_128x128.sass new file mode 100644 index 0000000..22b8782 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_nn_128x128.sass @@ -0,0 +1,327 @@ +# Kernel: sgemm_nn_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, xmad_ta, xmad_tb, tid31, tid128 + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 ~ k<1-3>, x<1-3> + + 64-79 : j0Ay<0-7>, j0Bx<0-7> 
+ 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-107 : loadA<0-7>, loadB<0-3> + + 108-111 : trackA<0-1>, trackB<0-1> + + 112-121 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop + 122-127 ~ readAs, readBs, tid + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 MOV loop, RZ; +--:-:-:-:1 STS.128 [addr_zero], RZ; + + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); + + +// tidAY = (tid & 1) << 2 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; + +// tidAX = tid >> 1 +--:-:-:-:1 SHR.U32 tidAX, tid, 1; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) & 7 +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 SHL tidBX, tid31, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x305; // 3 bits at position 5 + +// trackB += (blkB*128 + ldb*tidBY + tidBX) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 7; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeAs = 4 *
(128 * tidAY + tidAX) +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<128*8*2>, 2; + +// writeBs = (128*tidBY + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 7; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<128*8*3>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + + + our $vec; + return $vec ? q{ + +// k must be multiple of 8 +--:-:1:-:1 @P6 LDG.E.CI.128 loadB0, [trackB]; + +--:-:2:-:1 @P5 LDG.E.CI.128 loadA0, [trackA + 4x<0>]; +--:5:6:-:1 @P5 LDG.E.CI.128 loadA4, [trackA + 4x<8>]; + +--:-:3:-:1 @!P6 LDS.U.128 loadB0, [addr_zero]; +--:-:4:-:1 @!P5 LDS.U.128 loadA0, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.128 loadA4, [addr_zero]; + +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +05:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +0a:-:-:-:1 STS [writeAs + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeAs + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*128>], loadA3; + +10:-:-:-:6 IADD trackA0.CC, trackA0, 4x<16>; +--:-:-:-:1 IADD.X trackA1, trackA1, RZ; + + } : q{ + + + +// doLoad0 = tidBY < k +--:-:-:-:1 IADD x1, txb, 1; +--:-:-:-:1 IADD x2, txb, 2; +--:-:-:-:1 IADD x3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_n, P0; + +--:-:6:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:6:-:1 
@P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + +--:-:-:-:1 IADD k1, tidAY, 1; +--:-:-:-:1 IADD k2, tidAY, 2; +--:-:-:-:1 IADD k3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + + + +// bDoRemainder = k > 8 +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + +20:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128>], loadA0; +04:-:-:-:1 STS [writeAs + 4x<1*128>], loadA1; +08:-:-:-:1 STS [writeAs + 4x<2*128>], loadA2; +10:-:-:-:1 STS [writeAs + 4x<3*128>], loadA3; + +--:-:-:-:6 IADD trackA0.CC, trackA0, 4x<8>; +--:-:-:-:1 IADD.X trackA1, trackA1, RZ; + }; + + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 LOP.XOR writeAs, writeAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR writeBs, writeBs, 4x<128*8*2>; + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n"); + our %insert = + ( + ($vec ? 
+ ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c15 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P5;\n", + + j0c10 => "--:-:2:-:1 \@P3 LDG.E.CI.128 loadB0, [trackB];\n", + + j0c28 => "--:-:3:-:1 \@P2 LDG.E.CI.128 loadA0, [trackA + 4x<0>];\n", + j0c30 => "20:5:6:-:1 \@P2 LDG.E.CI.128 loadA4, [trackA + 4x<8>];\n", + + j4c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128>], loadA4;\n", + j4c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128>], loadA5;\n", + j4c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128>], loadA6;\n", + j4c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128>], loadA7;\n", + + j5c35 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c29 => "04:-:-:-:1 \@P1 STS [writeAs + 4x<0*128>], loadA0;\n", + j6c31 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<1*128>], loadA1;\n", + j6c33 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<2*128>], loadA2;\n", + j6c35 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<3*128>], loadA3;\n", + + j6c46 => "10:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 4x<16>;\n", + j6c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n", + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + j0c10 => "--:-:6:-:1 \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j0c29 => "--:-:6:-:1 \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j0c31 => "--:-:6:-:1 \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j0c33 => "--:-:6:-:1 \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + + j0c35 => "--:-:2:-:1 \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j1c29 => "--:-:3:-:1 \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j1c31 => "--:-:4:-:1 \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j1c33 => "--:-:5:-:1 \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + + j5c39 => "20:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c29 => "02:-:-:-:1 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n", + j6c31 => 
"04:-:-:-:1 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n", + j6c33 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n", + j6c35 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n", + + j6c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n", + j6c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ) + ), + + j5c46 => "--:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j5c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:0 IADD32I k, k, -8;\n" . + "--:-:-:-:5 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_nn_128x32.sass new file mode 100644 index 0000000..8194777 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_nn_128x32.sass @@ -0,0 +1,485 @@ +# Kernel: sgemm_nn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<(128*16 + 32)*2 + 32*16*2> + szShareA : (128*16 + 32) + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, shiftAX + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 
MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb16, ldb, 6; +--:-:-:-:1 SHL lda32, lda, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa00, blkA, tidAX, 7; +--:-:-:-:1 IADD txa32, txa00, 32; +--:-:-:-:1 IADD txa64, txa00, 64; +--:-:-:-:1 IADD txa96, txa00, 96; + +--:-:-:-:1 XMAD.LO ta00, lda, txa00, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta00, ldaz, blkZ, ta00; +--:-:-:-:1 IADD ta32, ta00, lda32; +--:-:-:-:1 IADD ta64, ta32, lda32; +--:-:-:-:1 IADD ta96, ta64, lda32; + +--:-:-:-:1 LEA track0A0.CC, ta00, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta00, param_A[1], RZ, 2; +--:-:-:-:1 LEA track1A0.CC, ta32, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track1A1, ta32, param_A[1], RZ, 2; +--:-:-:-:1 LEA track2A0.CC, ta64, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track2A1, ta64, param_A[1], RZ, 2; +--:-:-:-:1 LEA track3A0.CC, ta96, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track3A1, ta96, param_A[1], RZ, 2; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 2; + +// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX,
5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:2:-:1 @P3 LDG.E.CI.128 load1A, [track1A]; +--:-:3:-:1 @P4 LDG.E.CI.128 load2A, [track2A]; +--:-:4:-:1 @P5 LDG.E.CI.128 load3A, [track3A]; +--:-:5:-:1 @P6 LDG.E.CI.128 loadB, [trackB]; + + + +--:-:6:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load1A, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 load2A, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 load3A, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.128 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:1:-:1 @P0 LDG.E.CI load0A0, [track0A + 4x<0>]; +--:-:1:-:1 @P1 LDG.E.CI load0A1, [track0A + 4x<1>]; 
+--:-:1:-:1 @P2 LDG.E.CI load0A2, [track0A + 4x<2>]; +--:-:1:-:1 @P3 LDG.E.CI load0A3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI load1A0, [track1A + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI load1A1, [track1A + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI load1A2, [track1A + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI load1A3, [track1A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI load2A0, [track2A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI load2A1, [track2A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load2A2, [track2A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI load2A3, [track2A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI load3A0, [track3A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI load3A1, [track3A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI load3A2, [track3A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load3A3, [track3A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, 
RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:5:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:5:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:5:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P1, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + + + +21:-:-:-:1 STS [writeAs + 4x<0*128 + 0*32>], load0A0; +--:-:-:-:0 IADD track0A0.CC, track0A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 0*32>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 0*32>], load0A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 0*32>], load0A3; + +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128 + 1*32>], load1A0; +--:-:-:-:0 IADD track1A0.CC, track1A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 1*32>], load1A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 1*32>], load1A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 1*32>], load1A3; + +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 2*32>], load2A0; +--:-:-:-:0 IADD track2A0.CC, track2A0, 4x<16>; +--:-:-:-:1 STS 
[writeAs + 4x<1*128 + 2*32>], load2A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 2*32>], load2A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 2*32>], load2A3; + +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 3*32>], load3A0; +--:-:-:-:0 IADD track3A0.CC, track3A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 3*32>], load3A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 3*32>], load3A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 3*32>], load3A3; + +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.CI.128 load0A, [track0A]; +--:-:4:-:1 @P3 LDG.E.CI.128 load1A, [track1A]; +--:-:5:-:1 @P4 LDG.E.CI.128 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.CI.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.128 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI load0A0, [track0A + 4x<0>]; +--:-:3:-:1 @P2 LDG.E.CI load0A1, [track0A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load0A2, [track0A + 4x<2>]; +--:-:3:-:1 @P2 LDG.E.CI load0A3, [track0A + 4x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI load1A0, [track1A + 4x<0>]; +--:-:4:-:1 @P3 LDG.E.CI load1A1, [track1A + 4x<1>]; +--:-:4:-:1 @P3 LDG.E.CI load1A2, [track1A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load1A3, [track1A + 4x<3>]; + +--:-:5:-:1 @P4 LDG.E.CI load2A0, [track2A + 4x<0>]; +--:-:5:-:1 @P4 LDG.E.CI load2A1, [track2A + 4x<1>]; +--:-:5:-:1 @P4 LDG.E.CI load2A2, [track2A + 4x<2>]; +--:-:5:-:1 @P4 LDG.E.CI load2A3, [track2A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load3A0, [track3A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load3A1, [track3A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load3A2, [track3A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load3A3, 
[track3A + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:6:-:1 @P6 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P6 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P6 LDG.E.CI loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n", + j3c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n", + + j9c6 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n", + + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 4x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 4x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2A0.CC, track2A0, 4x<16>;\n", + 
j7c13 => "--:-:-:-:1 \@P4 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, 4x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.128 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI.128 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.128 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI load2A0, [track2A + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P4 LDG.E.CI load2A1, [track2A + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P4 
LDG.E.CI load2A2, [track2A + 4x<2>];\n", + j10c3 => "--:-:5:-:1 \@P4 LDG.E.CI load2A3, [track2A + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_128x64.sass b/Kernel/SGEMM/Pascal/sgemm_nn_128x64.sass new file mode 100644 index 0000000..2fca939 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_nn_128x64.sass @@ -0,0 +1,414 @@ +# Kernel: sgemm_nn_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*2 + 64*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ tid, blkA, blkB, blkZ, txb, tidAY, tidBY, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, ta, xmad_ta, tb, tid15, xmad_tb, k<1-3>, x<1-3> + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-115 : loadAA<0-7>, loadA<0-7>, loadB<0-3> + + 116-121 : track0A<0-1>, track1A<0-1>, trackB<0-1> + + 122-125 ~ writeAs, writeBs, k, swapBuf + 126-127 ~ readAs, readBs + + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-125 ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +--:-:-:-:1 STS.128 [addr_zero], RZ; 
+ + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +// tidAY = (tid & 1) << 2 +// tidAX = tid >> 1 +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; +01:-:-:-:1 SHR.U32 tidAX, tid, 1; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA track0A0.CC, ta, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta, param_A[1], RZ, 2; +--:-:-:-:1 LEA track1A0.CC, lda, track0A0, 8; +--:-:-:-:1 LEA.HI.X track1A1, lda, track0A1, RZ, 8; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa, param_m, PT; +--:-:-:-:1 IADD txa, txa, 64; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// tidBX = (tid & 15) << 2 +// tidBY = (tid >> 4) & 7 +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidBX, tid15, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4 + +// trackB += (blkB*64 + tidBX + ldb*tidBY) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:2 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// Start the write buffers high +// writeAs = (128*tidAY + tidAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; + +// writeBs = (64*tidBY + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; + +// Start the read buffers low +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs,
4x<128*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<64*8 + 128*8>; + + +REMAINDER: + + + our $vec; + return $vec ? q{ + +// k must be multiple of 8 +--:-:2:-:1 @P6 LDG.E.CI.128 loadB0, [trackB]; + +--:-:3:-:1 @P4 LDG.E.CI.128 loadA0, [track0A + 4x<0>]; +--:-:3:-:1 @P4 LDG.E.CI.128 loadAA0, [track0A + 4x<8>]; + +--:-:4:-:1 @P5 LDG.E.CI.128 loadA4, [track1A + 4x<0>]; +--:-:4:-:1 @P5 LDG.E.CI.128 loadAA4, [track1A + 4x<8>]; + +--:-:-:-:1 @!P6 LDS.U.128 loadB0, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 loadA0, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 loadA4, [addr_zero]; +--:-:-:-:1 @!P4 LDS.U.128 loadAA0, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.128 loadAA4, [addr_zero]; + +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +22:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 00>], loadA0; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 00>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 00>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*128 + 00>], loadA3; + +--:-:-:-:6 IADD track0A0.CC, track0A0, 4x<16>; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 64>], loadA4; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 64>], loadA5; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 64>], loadA6; +--:-:-:-:1 STS [writeAs + 4x<3*128 + 64>], loadA7; + +--:-:-:-:6 IADD track1A0.CC, track1A0, 4x<16>; +--:-:-:-:1 IADD.X track1A1, track1A1, RZ; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:0 IADD swapBuf, RZ, -swapBuf; + + } : q{ + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; + +01:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 SHL tidAY, tid1, 2; +--:-:-:-:1 LOP.AND tid15, tid, 15; +--:-:-:-:1 SHL tidBX, tid15, 2; +--:-:-:-:1 BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4 +02:-:-:-:1 ISCADD txb, 
blkB, tidBX, 6; + +// doLoad0 = tidBY < k +--:-:-:-:1 IADD x1, txb, 1; +--:-:-:-:1 IADD x2, txb, 2; +--:-:-:-:1 IADD x3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_n, P0; + +--:-:2:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 IADD k1, tidAY, 1; +--:-:-:-:1 IADD k2, tidAY, 2; +--:-:-:-:1 IADD k3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI loadA0, [track0A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI loadA1, [track0A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI loadA2, [track0A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI loadA3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI loadA4, [track1A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI loadA5, [track1A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI loadA6, [track1A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI loadA7, [track1A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA4, RZ; +--:-:-:-:1 @!P1 MOV loadA5, RZ; +--:-:-:-:1 @!P2 MOV loadA6, RZ; +--:-:-:-:1 @!P3 MOV loadA7, RZ; + + +02:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 00>], 
loadA0; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 00>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 00>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*128 + 00>], loadA3; + +--:-:-:-:6 IADD track0A0.CC, track0A0, 4x<8>; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 64>], loadA4; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 64>], loadA5; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 64>], loadA6; +--:-:-:-:1 STS [writeAs + 4x<3*128 + 64>], loadA7; + +--:-:-:-:6 IADD track1A0.CC, track1A0, 4x<8>; +--:-:-:-:1 IADD.X track1A1, track1A1, RZ; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + + }; + + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, P6;\n"); + our %insert = + ( + ($vec ? + ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + + j0c11 => "--:-:2:-:1 \@P0 LDG.E.CI.128 loadB0, [trackB];\n", + + j0c12 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c13 => "--:-:-:-:1 IADD32I k, k, -8;\n", + + j0c23 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P4;\n", + j0c24 => "--:-:-:-:1 PSETP.AND.AND P3, PT, P0, P1, P5;\n", + + j0c35 => "--:-:3:-:1 \@P2 LDG.E.CI.128 loadA0, [track0A + 4x<0>];\n", + j0c37 => "--:-:3:-:1 \@P2 LDG.E.CI.128 loadAA0, [track0A + 4x<8>];\n", + + j0c39 => "--:-:4:-:1 \@P3 LDG.E.CI.128 loadA4, [track1A + 4x<0>];\n", + j0c41 => "10:6:5:-:1 \@P3 LDG.E.CI.128 loadAA4, [track1A + 4x<8>];\n", + + j2c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128 + 00>], loadAA0;\n", + j2c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 00>], loadAA1;\n", + j2c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 00>], loadAA2;\n", + j2c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 00>], loadAA3;\n", + + j3c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 
4x<0*128 + 64>], loadAA4;\n", + j3c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 64>], loadAA5;\n", + j3c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 64>], loadAA6;\n", + j3c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 64>], loadAA7;\n", + + j5c29 => "04:-:-:-:1 \@P1 STS [writeAs + 4x<0*128 + 00>], loadA0;\n", + j5c31 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<1*128 + 00>], loadA1;\n", + j5c33 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<2*128 + 00>], loadA2;\n", + j5c35 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<3*128 + 00>], loadA3;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 4x<16>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + + j6c29 => "08:-:-:-:1 \@P1 STS [writeAs + 4x<0*128 + 64>], loadA4;\n", + j6c31 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<1*128 + 64>], loadA5;\n", + j6c33 => "--:-:-:-:1 \@P1 STS [writeAs + 4x<2*128 + 64>], loadA6;\n", + j6c35 => "--:2:-:-:1 \@P1 STS [writeAs + 4x<3*128 + 64>], loadA7;\n", + + j6c46 => "20:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 4x<16>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P4;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P5;\n", + + j0c10 => "--:-:2:-:1 \@P0 LDG.E.CS loadB0, [trackB + 4x<0>];\n", + j0c12 => "--:-:2:-:1 \@P0 LDG.E.CS loadB1, [trackB + 4x<1>];\n", + j0c14 => "--:-:2:-:1 \@P0 LDG.E.CS loadB2, [trackB + 4x<2>];\n", + j0c16 => "--:-:2:-:1 \@P0 LDG.E.CS loadB3, [trackB + 4x<3>];\n", + + j0c18 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + j0c20 => "--:-:-:-:1 IADD32I k, k, -8;\n", + + j0c33 => "--:-:3:-:1 \@P2 LDG.E.CI loadA0, [track0A + 4x<0>];\n", + j0c35 => "--:-:3:-:1 \@P2 LDG.E.CI loadA1, [track0A + 4x<1>];\n", + j0c37 => "--:-:3:-:1 \@P2 LDG.E.CI loadA2, [track0A + 4x<2>];\n", + j0c39 => "--:-:3:-:1 \@P2 LDG.E.CI loadA3, [track0A + 4x<3>];\n", + + j1c29 => "--:-:4:-:1 \@P3 LDG.E.CI loadA4, [track1A + 
4x<0>];\n", + j1c31 => "--:-:4:-:1 \@P3 LDG.E.CI loadA5, [track1A + 4x<1>];\n", + j1c33 => "--:-:4:-:1 \@P3 LDG.E.CI loadA6, [track1A + 4x<2>];\n", + j1c35 => "--:-:4:-:1 \@P3 LDG.E.CI loadA7, [track1A + 4x<3>];\n", + + j5c29 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 00>], loadA0;\n", + j5c31 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 00>], loadA1;\n", + j5c33 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 00>], loadA2;\n", + j5c35 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 00>], loadA3;\n", + + j5c46 => "--:-:-:-:1 \@P0 IADD track0A0.CC, track0A0, 4x<8>;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X track0A1, track0A1, RZ;\n", + + j6c29 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 64>], loadA4;\n", + j6c31 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 64>], loadA5;\n", + j6c33 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 64>], loadA6;\n", + j6c35 => "--:2:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 64>], loadA7;\n", + + j6c46 => "--:-:-:-:1 \@P0 IADD track1A0.CC, track1A0, 4x<8>;\n", + j6c54 => "--:-:-:-:1 \@P0 IADD.X track1A1, track1A1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ) + ), + + j4c21 => "02:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j4c22 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j4c27 => "--:-:-:-:1 \@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "02:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + + ); + return ''; # this snippet only fills @top/%insert, so it yields an explicit empty string + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_32x128.sass b/Kernel/SGEMM/Pascal/sgemm_nn_32x128.sass new file mode 100644 index 0000000..e25c3a9 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_nn_32x128.sass @@ -0,0 +1,458 @@ +# Kernel: sgemm_nn_32x128 + +# Copyright 2014 Nervana Systems Inc.
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<128*16*2 + (32*16 + 32)*2> + szShareA : (32*16 + 32) + szShareB : (128*16) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadA<0-3> + 84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + + 100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, 
tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; +--:-:3:-:1 S2R blkA, SR_CTAID.Y; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb4, ldb, 2; +--:-:-:-:1 SHL ldb16, ldb, 6; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 31) << 2 +// tidBY = (tid >> 5) +01:-:-:-:1 LOP.AND tidBX, tid, 31; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 5; + +// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 4 +04:-:-:-:1 ISCADD txa, blkA, tidAX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidAY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 2; + +// trackB += (blkB*128 + tidBX + ldb*tidBY) * 4 +02:-:-:-:1 ISCADD txb, blkB, tidBX, 7; +--:-:-:-:1 XMAD.LO2 tb0, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb0, ldbz, blkZ, tb0; +--:-:-:-:1 IADD tb1, tb0, ldb4; +--:-:-:-:1 IADD tb2, tb1, ldb4; +--:-:-:-:1 IADD tb3, tb2, ldb4; + +--:-:-:-:1 LEA track0B0.CC, tb0, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track0B1, tb0, param_B[1], RZ, 2; +--:-:-:-:1 LEA track1B0.CC, tb1, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track1B1, tb1, param_B[1], RZ, 2; +--:-:-:-:1 LEA track2B0.CC, tb2, 
param_B[0], 2; +--:-:-:-:1 LEA.HI.X track2B1, tb2, param_B[1], RZ, 2; +--:-:-:-:1 LEA track3B0.CC, tb3, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track3B1, tb3, param_B[1], RZ, 2; + +// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 + 4x<szShareA + szShareB> +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2; + +// writeBs = (tidBY*128 + tidBX) * 4 + 4x<szShareA*2 + szShareB> +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 7; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4) + 4x<szShareA> +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readBs, readBs, tid96; +--:-:-:-:1 ISCADD readBs, readBs, 4x<szShareA>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<szShareA + szShareB>; // NOTE(review): the szShare operands in this section were reconstructed from the addr_zero layout - confirm against upstream + + +REMAINDER: + + + +--:-:-:-:1 IADD tidBY1, tidBY, 4; +--:-:-:-:1 IADD tidBY2, tidBY, 8; +--:-:-:-:1 IADD tidBY3, tidBY, 12; + + + our $vec; + return $vec ?
q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txb, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidBY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidBY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidBY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidBY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.CI.128 load0B, [track0B]; +--:-:2:-:1 @P1 LDG.E.CI.128 load1B, [track1B]; +--:-:3:-:1 @P2 LDG.E.CI.128 load2B, [track2B]; +--:-:4:-:1 @P3 LDG.E.CI.128 load3B, [track3B]; +--:-:5:-:1 @P4 LDG.E.CI.128 loadA, [trackA]; + + + +--:-:6:-:1 @!P0 LDS.U.128 load0B, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 load1B, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.128 load2B, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load3B, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 loadA, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:1:-:1 @P0 LDG.E.CI load0B0, [track0B + 4x<0>]; +--:-:1:-:1 @P1 LDG.E.CI load0B1, [track0B + 4x<1>]; +--:-:1:-:1 @P2 LDG.E.CI load0B2, [track0B + 4x<2>]; +--:-:1:-:1 @P3 LDG.E.CI load0B3, [track0B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidBY1, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P5; + +--:-:2:-:1 @P0 LDG.E.CI load1B0, [track1B + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI 
load1B1, [track1B + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI load1B2, [track1B + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI load1B3, [track1B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:3:-:1 @P0 LDG.E.CI load2B0, [track2B + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI load2B1, [track2B + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load2B2, [track2B + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI load2B3, [track2B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:4:-:1 @P0 LDG.E.CI load3B0, [track3B + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI load3B1, [track3B + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI load3B2, [track3B + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load3B3, [track3B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P6; + +--:-:5:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:5:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:5:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + 
+--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P0, RZ, k, 15; +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 16, P0; + + + +21:-:-:-:1 STS.128 [writeBs + 4x<0*128>], load0B; +--:-:-:-:6 IADD track0B0.CC, track0B0, ldb16; +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +02:-:-:-:1 STS.128 [writeBs + 4x<4*128>], load1B; +--:-:-:-:6 IADD track1B0.CC, track1B0, ldb16; +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +04:-:-:-:1 STS.128 [writeBs + 4x<8*128>], load2B; +--:-:-:-:6 IADD track2B0.CC, track2B0, ldb16; +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +08:-:-:-:1 STS.128 [writeBs + 4x<12*128>], load3B; +--:-:-:-:6 IADD track3B0.CC, track3B0, ldb16; +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +10:-:-:-:1 STS [writeAs + 4x<0*32>], loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*32>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*32>], loadA2; +--:-:-:-:1 STS [writeAs + 4x<3*32>], loadA3; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P2 LDG.E.CI.128 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.128 load1B, [track1B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load2B, [track2B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load3B, [track3B]; +--:-:6:-:1 @P6 LDG.E.CI.128 loadA, [trackA]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI load0B0, [track0B + 4x<0>]; +--:-:3:-:1 @P2 LDG.E.CI load0B1, [track0B + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load0B2, [track0B + 4x<2>]; +--:-:3:-:1 @P2 LDG.E.CI load0B3, [track0B + 4x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI load1B0, [track1B + 4x<0>]; +--:-:4:-:1 @P3 LDG.E.CI load1B1, [track1B + 4x<1>]; +--:-:4:-:1 @P3 LDG.E.CI load1B2, [track1B + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load1B3, [track1B + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load2B0, [track2B + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load2B1, [track2B + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load2B2, [track2B + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load2B3, [track2B + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load3B0, [track3B + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load3B1, [track3B + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load3B2, [track3B + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load3B3, [track3B + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:6:-:1 @P6 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:6:-:1 @P6 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:6:-:1 @P6 LDG.E.CI loadA3, [trackA + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n", + j9c6 => "--:5:-:-:1 \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n", + j11c6 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<0*32>], loadA0;\n", + j11c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], loadA1;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], loadA2;\n", + j11c12 => 
"--:6:-:-:1 \@P0 STS [writeAs + 4x<3*32>], loadA3;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0B0.CC, track0B0, ldb16;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0B1, track0B1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, ldb16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2B0.CC, track2B0, ldb16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2B1, track2B1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3B0.CC, track3B0, ldb16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3B1, track3B1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackA0.CC, trackA0, 4x<16>;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackA1, trackA1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.128 load0B, [track0B];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.128 load1B, [track1B];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.128 load2B, [track2B];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.128 load3B, [track3B];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.128 loadA, [trackA];\n", + ) : + ( + + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI load2B0, [track2B + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E.CI load2B1, [track2B + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E.CI load2B2, [track2B + 4x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E.CI load2B3, [track2B + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI load3B3, [track3B + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . 
+ "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_rnn_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_nn_rnn_128x32.sass new file mode 100644 index 0000000..21b493d --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_nn_rnn_128x32.sass @@ -0,0 +1,512 @@ +# Kernel: sgemm_nn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(128*16 + 32)*2 + 32*16*2> + szShareA : (128*16 + 32) + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_bias[0] : c[0x0][0x158] + param_bias[1] : c[0x0][0x15c] + param_lockAddr[0] : c[0x0][0x160] + param_lockAddr[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_xcutoff : c[0x0][0x170] + param_flags : c[0x0][0x174] + param_lda : c[0x0][0x178] + param_ldb8 : c[0x0][0x17c] + param_ldc : c[0x0][0x180] + param_m : c[0x0][0x184] + param_n : c[0x0][0x188] + param_k : c[0x0][0x18c] + param_ldaz : c[0x0][0x190] + param_ldbz : c[0x0][0x194] + param_ldcz : c[0x0][0x198] + param_loops : c[0x0][0x19c] + param_dimB : c[0x0][0x1a0] + param_dimC : c[0x0][0x1a4] + param_unrolling : c[0x0][0x1a8] + param_numBlks : c[0x0][0x1ac] + param_numAblks : c[0x0][0x1b0] + + + + + 32-79 ~ lda, ldb, ldaz, lda32, 
ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, offsetB, shiftAX + 80-81 : baseB<0-1> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, time_step + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-61 : bias00y<0-1>, bias04y<0-1>, bias08y<0-1>, bias12y<0-1>, b0, b1, b2, b3, baseC<0-1> + 62-66 : blkId, nextBlk, lockAddr<0-1>, lockVal + 67-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags, xcutoff, offsetC, numBlk + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; + +--:-:-:-:1 MOV time_step, RZ; +--:-:-:-:1 MOV flags, param_flags; + +RNN_LOOP: + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb16, ldb, 6; +--:-:-:-:1 SHL lda32, lda, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +--:-:-:-:6 LOP.AND.NZ P0, RZ, flags, 4; +--:-:-:-:6 @P0 IADD offsetB, -time_step, param_unrolling; +--:-:-:-:6 @P0 IADD offsetB, offsetB, -1; +--:-:-:-:6 @!P0 MOV offsetB, time_step; + +// baseB = param_B + 4 * dimB * offsetB (offsetB = time_step, or param_unrolling - 1 - time_step when flags & 4)
+--:-:-:-:1 XMAD offsetB, offsetB, param_dimB, RZ; +--:-:-:-:1 LEA baseB0.CC, offsetB, param_B[0], 2; +--:-:-:-:1 LEA.HI.X baseB1, offsetB, param_B[1], RZ, 2; + +// tidAX = tid >> 2 +// tidAY = (tid & 3) << 2 +// shiftAX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidAX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidAY, tid3, 2; +--:-:-:-:1 SHL shiftAX, tid3, 3; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY) +02:-:-:-:1 ISCADD txa00, blkA, tidAX, 7; +--:-:-:-:1 IADD txa32, txa00, 32; +--:-:-:-:1 IADD txa64, txa00, 64; +--:-:-:-:1 IADD txa96, txa00, 96; + +--:-:-:-:1 XMAD.LO ta00, lda, txa00, tidAY, xmad_ta; +--:-:-:-:1 XMAD.LO2 ta00, ldaz, RZ, ta00; +--:-:-:-:1 IADD ta32, ta00, lda32; +--:-:-:-:1 IADD ta64, ta32, lda32; +--:-:-:-:1 IADD ta96, ta64, lda32; + +--:-:-:-:1 LEA track0A0.CC, ta00, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta00, param_A[1], RZ, 2; +--:-:-:-:1 LEA track1A0.CC, ta32, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track1A1, ta32, param_A[1], RZ, 2; +--:-:-:-:1 LEA track2A0.CC, ta64, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track2A1, ta64, param_A[1], RZ, 2; +--:-:-:-:1 LEA track3A0.CC, ta96, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track3A1, ta96, param_A[1], RZ, 2; + +// trackB += 4 * (blkB*32 + ldb*tidBY + tidBX) +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +--:-:-:-:1 XMAD.LO2 tb, ldbz, RZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, baseB0, 2; +--:-:-:-:1 LEA.HI.X trackB1, tb, baseB1, RZ, 2; + +// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 IADD writeAs, writeAs, shiftAX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidBY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.128 load0A, [track0A]; +--:-:2:-:1 @P3 LDG.E.128 load1A, [track1A]; +--:-:3:-:1 @P4 LDG.E.128 load2A, [track2A]; +--:-:4:-:1 @P5 LDG.E.128 load3A, [track3A]; +--:-:5:-:1 @P6 LDG.E.128 loadB, [trackB]; + + + +--:-:6:-:1 @!P2 LDS.U.128 load0A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load1A, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 load2A, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 load3A, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.128 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidAY1, tidAY, 1; +--:-:-:-:1 IADD tidAY2, tidAY, 2; +--:-:-:-:1 IADD tidAY3, tidAY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:1:-:1 @P0 LDG.E load0A0, [track0A + 4x<0>]; +--:-:1:-:1 @P1 LDG.E load0A1, [track0A + 4x<1>]; +--:-:1:-:1 @P2 LDG.E load0A2, [track0A + 4x<2>]; +--:-:1:-:1 @P3 LDG.E load0A3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV 
load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:2:-:1 @P0 LDG.E load1A0, [track1A + 4x<0>]; +--:-:2:-:1 @P1 LDG.E load1A1, [track1A + 4x<1>]; +--:-:2:-:1 @P2 LDG.E load1A2, [track1A + 4x<2>]; +--:-:2:-:1 @P3 LDG.E load1A3, [track1A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txa64, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P4; + +--:-:3:-:1 @P0 LDG.E load2A0, [track2A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E load2A1, [track2A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E load2A2, [track2A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E load2A3, [track2A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa96, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; + +--:-:4:-:1 @P0 LDG.E load3A0, [track3A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E load3A1, [track3A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E load3A2, [track3A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E load3A3, [track3A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P6, 
PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P6; + +--:-:5:-:1 @P0 LDG.E loadB0, [trackB + 4x<0>]; +--:-:5:-:1 @P1 LDG.E loadB1, [trackB + 4x<1>]; +--:-:5:-:1 @P2 LDG.E loadB2, [trackB + 4x<2>]; +--:-:5:-:1 @P3 LDG.E loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txa00, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa32, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P1, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + + + +21:-:-:-:1 STS [writeAs + 4x<0*128 + 0*32>], load0A0; +--:-:-:-:0 IADD track0A0.CC, track0A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 0*32>], load0A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 0*32>], load0A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 0*32>], load0A3; + +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS [writeAs + 4x<0*128 + 1*32>], load1A0; +--:-:-:-:0 IADD track1A0.CC, track1A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 1*32>], load1A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 1*32>], load1A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 1*32>], load1A3; + +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS [writeAs + 4x<0*128 + 2*32>], load2A0; +--:-:-:-:0 IADD track2A0.CC, track2A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 2*32>], load2A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 2*32>], load2A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 2*32>], load2A3; + +--:-:-:-:0 IADD.X track2A1, 
track2A1, RZ; + +08:-:-:-:1 STS [writeAs + 4x<0*128 + 3*32>], load3A0; +--:-:-:-:0 IADD track3A0.CC, track3A0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*128 + 3*32>], load3A1; +--:-:-:-:1 STS [writeAs + 4x<2*128 + 3*32>], load3A2; +--:-:-:-:4 STS [writeAs + 4x<3*128 + 3*32>], load3A3; + +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.128 load0A, [track0A]; +--:-:4:-:1 @P3 LDG.E.128 load1A, [track1A]; +--:-:5:-:1 @P4 LDG.E.128 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.128 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P2 LDG.E load0A0, [track0A + 4x<0>]; +--:-:3:-:1 @P2 LDG.E load0A1, [track0A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E load0A2, [track0A + 4x<2>]; +--:-:3:-:1 @P2 LDG.E load0A3, [track0A + 4x<3>]; + +--:-:4:-:1 @P3 LDG.E load1A0, [track1A + 4x<0>]; +--:-:4:-:1 @P3 LDG.E load1A1, [track1A + 4x<1>]; +--:-:4:-:1 @P3 LDG.E load1A2, [track1A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E load1A3, [track1A + 4x<3>]; + +--:-:5:-:1 @P4 LDG.E load2A0, [track2A + 4x<0>]; +--:-:5:-:1 @P4 LDG.E load2A1, [track2A + 4x<1>]; +--:-:5:-:1 @P4 LDG.E load2A2, [track2A + 4x<2>]; +--:-:5:-:1 @P4 LDG.E load2A3, [track2A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E load3A0, [track3A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E load3A1, [track3A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E load3A2, [track3A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E load3A3, [track3A + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E loadB0, [trackB + 4x<0>]; +--:-:6:-:1 @P6 LDG.E loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P6 LDG.E loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P6 LDG.E loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our 
$shiftAX = 1; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n", + j3c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n", + + j9c6 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n", + + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, 4x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, 4x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2A0.CC, track2A0, 4x<16>;\n", + j7c13 => "--:-:-:-:1 \@P4 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, 4x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, 
ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.128 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.128 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P4 LDG.E.128 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.128 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E load0A0, [track0A + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E load0A1, [track0A + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E load0A2, [track0A + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E load0A3, [track0A + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E load1A0, [track1A + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E load1A1, [track1A + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E load1A2, [track1A + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E load1A3, [track1A + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P4 LDG.E load2A0, [track2A + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P4 LDG.E load2A1, [track2A + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P4 LDG.E load2A2, [track2A + 4x<2>];\n", + j10c3 => "--:-:5:-:1 \@P4 LDG.E load2A3, [track2A + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E load3A0, [track3A + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E load3A1, [track3A + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E load3A2, 
[track3A + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E load3A3, [track3A + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E loadB0, [trackB + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E loadB1, [trackB + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E loadB2, [trackB + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_nt_128x128.sass b/Kernel/SGEMM/Pascal/sgemm_nt_128x128.sass new file mode 100644 index 0000000..e01b4b5 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_nt_128x128.sass @@ -0,0 +1,339 @@ +# Kernel: sgemm_nt_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ blkA, blkB, blkZ, tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, tid127, txa, txb, xmad_ta, xmad_tb, tid128 + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-67 ~ k1, k2, k3 + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-111 : loadA<0-7>, loadB<0-7> + 112-115 : trackA<0-1>, trackB<0-1> + + 116-121 ~ writeS, k, tidY, ta, tb, loop + 122-127 ~ readAs, readBs, tid + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 LOP.AND tid1, tid, 1; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); + +--:-:-:-:1 MOV loop, RZ; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV 
ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// tidY = tid1 << 2 +--:-:-:-:1 SHL tidY, tid1, 2; + +// tidX = tid >> 1 +01:-:-:-:1 SHR.U32 tidX, tid, 1; + +// trackA += 4 * ((blkA*128 + tidX) * lda + tidY) +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x2; + +// trackB += 4 * ((blkB*128 + tidX) * ldb + tidY) +04:-:-:-:1 ISCADD txb, blkB, tidX, 7; +--:-:-:-:1 XMAD.LO tb, ldb, txb, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = 4 * (128 * tidY + tidX) +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 SHL writeS, writeS, 2; + +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<128*8*2>; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + + + our $vec; + return $vec ? 
q{ + +// k must be multiple of 8 +--:-:2:-:1 @P5 LDG.E.CI.128 loadA0, [trackA + 4x<0>]; +--:-:2:-:1 @P5 LDG.E.CI.128 loadA4, [trackA + 4x<8>]; + +--:-:3:-:1 @P6 LDG.E.CI.128 loadB0, [trackB + 4x<0>]; +--:5:4:-:1 @P6 LDG.E.CI.128 loadB4, [trackB + 4x<8>]; + +--:-:-:-:1 @!P5 LDS.U.128 loadA0, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.128 loadB0, [addr_zero]; +--:-:-:-:1 @!P5 LDS.U.128 loadA4, [addr_zero]; +--:-:-:-:1 @!P6 LDS.U.128 loadB4, [addr_zero]; + +--:-:-:-:0 PSETP.AND.AND P1, PT, PT, PT, PT; + +22:-:-:-:1 STS [writeS + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeS + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeS + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeS + 4x<3*128>], loadA3; + +--:-:-:-:6 IADD trackA0.CC, trackA0, 4x<16>; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +24:-:-:-:1 STS [writeS + 4x< 8*128>], loadB0; +--:-:-:-:1 STS [writeS + 4x< 9*128>], loadB1; +--:-:-:-:1 STS [writeS + 4x<10*128>], loadB2; +--:-:-:-:1 STS [writeS + 4x<11*128>], loadB3; + +10:-:-:-:6 IADD trackB0.CC, trackB0, 4x<16>; +--:-:-:-:1 IADD.X trackB1, trackB1, RZ; + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:0 LOP.XOR writeS, writeS, 4x<128*8*2>; + + } : q{ + + +--:-:-:-:1 IADD k1, tidY, 1; +--:-:-:-:1 IADD k2, tidY, 2; +--:-:-:-:1 IADD k3, tidY, 3; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, k3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, k1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, k2, k, P6; +--:-:-:-:1 
ISETP.LT.AND P3, PT, k3, k, P6; + +--:-:3:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 7; + +02:-:-:-:1 STS [writeS + 4x<0*128>], loadA0; +--:-:-:-:1 STS [writeS + 4x<1*128>], loadA1; +--:-:-:-:1 STS [writeS + 4x<2*128>], loadA2; +--:-:-:-:1 STS [writeS + 4x<3*128>], loadA3; + +--:-:-:-:6 IADD trackA0.CC, trackA0, 4x<8>; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +04:-:-:-:1 STS [writeS + 4x< 8*128>], loadB0; +--:-:-:-:1 STS [writeS + 4x< 9*128>], loadB1; +--:-:-:-:1 STS [writeS + 4x<10*128>], loadB2; +--:-:-:-:1 STS [writeS + 4x<11*128>], loadB3; + +--:-:-:-:6 IADD trackB0.CC, trackB0, 4x<8>; +--:-:-:-:1 IADD.X trackB1, trackB1, RZ; + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<128*8*2>; + +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; + }; + + + + our $vec; + our $vec; + our @top = $vec ? + ("--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n") : + ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, 16, P5;\n"); + our %insert = + ( + ($vec ? 
+ ( + j0c1 => "--:-:-:-:1 PSETP.AND.AND P1, PT, !P1, PT, PT;\n", + j0c13 => "--:-:-:-:1 PSETP.AND.AND P2, PT, P0, P1, P5;\n", + j0c14 => "--:-:-:-:1 PSETP.AND.AND P3, PT, P0, P1, P6;\n", + + j0c27 => "--:-:2:-:1 \@P2 LDG.E.CI.128 loadA0, [trackA + 4x<0>];\n", + j0c29 => "--:-:2:-:1 \@P2 LDG.E.CI.128 loadA4, [trackA + 4x<8>];\n", + + j0c31 => "--:-:3:-:1 \@P3 LDG.E.CI.128 loadB0, [trackB + 4x<0>];\n", + j0c33 => "08:5:4:-:1 \@P3 LDG.E.CI.128 loadB4, [trackB + 4x<8>];\n", + + j3c29 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<0*128>], loadA4;\n", + j3c31 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<1*128>], loadA5;\n", + j3c33 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<2*128>], loadA6;\n", + j3c35 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<3*128>], loadA7;\n", + + j4c29 => "--:-:-:-:1 \@!P1 STS [writeS + 4x< 8*128>], loadB4;\n", + j4c31 => "--:-:-:-:1 \@!P1 STS [writeS + 4x< 9*128>], loadB5;\n", + j4c33 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<10*128>], loadB6;\n", + j4c35 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<11*128>], loadB7;\n", + + j5c29 => "02:-:-:-:1 \@P1 STS [writeS + 4x<0*128>], loadA0;\n", + j5c31 => "--:-:-:-:1 \@P1 STS [writeS + 4x<1*128>], loadA1;\n", + j5c33 => "--:-:-:-:1 \@P1 STS [writeS + 4x<2*128>], loadA2;\n", + j5c35 => "--:-:-:-:1 \@P1 STS [writeS + 4x<3*128>], loadA3;\n", + + j6c29 => "04:-:-:-:1 \@P1 STS [writeS + 4x< 8*128>], loadB0;\n", + j6c31 => "--:-:-:-:1 \@P1 STS [writeS + 4x< 9*128>], loadB1;\n", + j6c33 => "--:-:-:-:1 \@P1 STS [writeS + 4x<10*128>], loadB2;\n", + j6c35 => "--:2:-:-:1 \@P1 STS [writeS + 4x<11*128>], loadB3;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 4x<16>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c46 => "10:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, 4x<16>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n", + ) : + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 16, P6;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j0c10 => 
"--:-:2:-:1 \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j0c29 => "--:-:2:-:1 \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j0c31 => "--:-:2:-:1 \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j0c33 => "--:-:2:-:1 \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + + j0c35 => "--:-:3:-:1 \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j1c27 => "--:-:3:-:1 \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j1c29 => "--:-:3:-:1 \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j1c31 => "--:-:3:-:1 \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + + j5c29 => "02:-:-:-:1 \@P0 STS [writeS + 4x<0*128>], loadA0;\n", + j5c31 => "--:-:-:-:1 \@P0 STS [writeS + 4x<1*128>], loadA1;\n", + j5c33 => "--:-:-:-:1 \@P0 STS [writeS + 4x<2*128>], loadA2;\n", + j5c35 => "--:-:-:-:1 \@P0 STS [writeS + 4x<3*128>], loadA3;\n", + + j6c29 => "04:-:-:-:1 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n", + j6c31 => "--:-:-:-:1 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n", + j6c33 => "--:-:-:-:1 \@P0 STS [writeS + 4x<10*128>], loadB2;\n", + j6c35 => "--:2:-:-:1 \@P0 STS [writeS + 4x<11*128>], loadB3;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c46 => "--:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, 4x<8>;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ), + ), + + j6c63 => "02:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" . + "--:-:-:-:1 IADD32I k, k, -8;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_nt_32x128.sass b/Kernel/SGEMM/Pascal/sgemm_nt_32x128.sass new file mode 100644 index 0000000..339c825 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_nt_32x128.sass @@ -0,0 +1,483 @@ +# Kernel: sgemm_nt_32x128 + +# Copyright 2014 Nervana Systems Inc. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<(128*16 + 32)*2 + (32*16 + 32)*2> + szShareA : (32*16 + 32) + szShareB : (128*16 + 32) + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda : c[0x0][0x164] + param_ldb : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ tidX, lda, ldb, ldaz, ldbz, ldb32, tid1, tid3, tid96, ta, tb00, tb32, tb64, tb96, xmad_ta, xmad_tb, shiftX, tidY<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadA<0-3> + 84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3> + + 100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1> + + 110-120 ~ writeAs, writeBs, k, tidY, txa, txb00, 
txb32, txb64, txb96 + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkB, SR_CTAID.Z; +--:-:3:-:1 S2R blkA, SR_CTAID.Y; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda; +--:-:-:-:1 MOV ldb, param_ldb; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL ldb32, ldb, 5; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidX = tid >> 2 +// tidY = (tid & 3) << 2 +// shiftX = (tid & 3) << 3 +01:-:-:-:1 SHR.U32 tidX, tid, 2; +01:-:-:-:1 LOP.AND tid3, tid, 3; +--:-:-:-:1 SHL tidY, tid3, 2; +--:-:-:-:1 SHL shiftX, tid3, 3; + +// trackA += ((blkA*32 + tidX) * lda + tidY) * 4 +04:-:-:-:1 ISCADD txa, blkA, tidX, 5; +--:-:-:-:1 XMAD.LO ta, lda, txa, tidY, xmad_ta; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 2; + +// trackB += ((blkB*128 + tidX) * ldb + tidY) * 4 +02:-:-:-:1 ISCADD txb00, blkB, tidX, 7; +--:-:-:-:1 IADD txb32, txb00, 32; +--:-:-:-:1 IADD txb64, txb00, 64; +--:-:-:-:1 IADD txb96, txb00, 96; + +--:-:-:-:1 XMAD.LO tb00, ldb, txb00, tidY, xmad_tb; +08:-:-:-:1 XMAD.LO2 tb00, ldbz, blkZ, tb00; +--:-:-:-:1 IADD tb32, tb00, ldb32; +--:-:-:-:1 IADD tb64, tb32, ldb32; +--:-:-:-:1 IADD tb96, tb64, ldb32; + +--:-:-:-:1 LEA track0B0.CC, tb00, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track0B1, tb00, param_B[1], RZ, 2; +--:-:-:-:1 LEA track1B0.CC, tb32, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track1B1, tb32, param_B[1], RZ, 2; +--:-:-:-:1 LEA track2B0.CC, tb64, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track2B1, tb64, param_B[1], RZ, 2; +--:-:-:-:1 LEA track3B0.CC, 
tb96, param_B[0], 2; +--:-:-:-:1 LEA.HI.X track3B1, tb96, param_B[1], RZ, 2; + +// writeAs = (tidY*32 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 5; +--:-:-:-:1 IADD writeAs, writeAs, shiftX; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidY*128 + tidX + shiftX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 7; +--:-:-:-:1 IADD writeBs, writeBs, shiftX; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 16; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + +// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4 +01:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 SHR.U32 tid96, tid96, 2; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 LOP.OR readBs, readBs, tid96; +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + + + our $vec; + return $vec ? 
q{ +--:-:-:-:1 ISETP.LT.AND P2, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb32, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P4, PT, txb64, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txb96, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; + +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P2; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P3; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P5, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P6, PT, tidY, k, P6; + + +--:-:1:-:1 @P2 LDG.E.CI.128 load0B, [track0B]; +--:-:2:-:1 @P3 LDG.E.CI.128 load1B, [track1B]; +--:-:3:-:1 @P4 LDG.E.CI.128 load2B, [track2B]; +--:-:4:-:1 @P5 LDG.E.CI.128 load3B, [track3B]; +--:-:5:-:1 @P6 LDG.E.CI.128 loadA, [trackA]; + + + +--:-:6:-:1 @!P2 LDS.U.128 load0B, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load1B, [addr_zero]; +--:-:6:-:1 @!P4 LDS.U.128 load2B, [addr_zero]; +--:-:6:-:1 @!P5 LDS.U.128 load3B, [addr_zero]; +--:-:6:-:1 @!P6 LDS.U.128 loadA, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD tidY1, tidY, 1; +--:-:-:-:1 IADD tidY2, tidY, 2; +--:-:-:-:1 IADD tidY3, tidY, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P4; + +--:-:1:-:1 @P0 LDG.E.CI load0B0, [track0B + 4x<0>]; +--:-:1:-:1 @P1 LDG.E.CI load0B1, [track0B + 4x<1>]; +--:-:1:-:1 @P2 LDG.E.CI load0B2, [track0B + 4x<2>]; +--:-:1:-:1 @P3 LDG.E.CI load0B3, [track0B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0B0, RZ; +--:-:-:-:1 @!P1 MOV load0B1, RZ; +--:-:-:-:1 @!P2 MOV load0B2, RZ; +--:-:-:-:1 @!P3 MOV load0B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb32, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P5; + +--:-:2:-:1 @P0 LDG.E.CI load1B0, [track1B + 4x<0>]; 
+--:-:2:-:1 @P1 LDG.E.CI load1B1, [track1B + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI load1B2, [track1B + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI load1B3, [track1B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1B0, RZ; +--:-:-:-:1 @!P1 MOV load1B1, RZ; +--:-:-:-:1 @!P2 MOV load1B2, RZ; +--:-:-:-:1 @!P3 MOV load1B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P4, PT, txb64, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P4; + +--:-:3:-:1 @P0 LDG.E.CI load2B0, [track2B + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI load2B1, [track2B + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load2B2, [track2B + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI load2B3, [track2B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2B0, RZ; +--:-:-:-:1 @!P1 MOV load2B1, RZ; +--:-:-:-:1 @!P2 MOV load2B2, RZ; +--:-:-:-:1 @!P3 MOV load2B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txb96, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P5; + +--:-:4:-:1 @P0 LDG.E.CI load3B0, [track3B + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI load3B1, [track3B + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI load3B2, [track3B + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load3B3, [track3B + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3B0, RZ; +--:-:-:-:1 @!P1 MOV load3B1, RZ; +--:-:-:-:1 @!P2 MOV load3B2, RZ; +--:-:-:-:1 @!P3 MOV load3B3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY1, k, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY2, k, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY3, k, P6; + +--:-:5:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:5:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:5:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; 
+--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P2, PT, txb00, param_n, PT; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb32, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3; +--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:1 LOP.AND.NZ P0, RZ, k, 15; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P0; + + + +21:-:-:-:1 STS [writeBs + 4x<0*128 + 0*32>], load0B0; +--:-:-:-:0 IADD track0B0.CC, track0B0, 4x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 0*32>], load0B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 0*32>], load0B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 0*32>], load0B3; + +--:-:-:-:0 IADD.X track0B1, track0B1, RZ; + +02:-:-:-:1 STS [writeBs + 4x<0*128 + 1*32>], load1B0; +--:-:-:-:0 IADD track1B0.CC, track1B0, 4x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 1*32>], load1B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 1*32>], load1B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 1*32>], load1B3; + +--:-:-:-:0 IADD.X track1B1, track1B1, RZ; + +04:-:-:-:1 STS [writeBs + 4x<0*128 + 2*32>], load2B0; +--:-:-:-:0 IADD track2B0.CC, track2B0, 4x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 2*32>], load2B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 2*32>], load2B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 2*32>], load2B3; + +--:-:-:-:0 IADD.X track2B1, track2B1, RZ; + +08:-:-:-:1 STS [writeBs + 4x<0*128 + 3*32>], load3B0; +--:-:-:-:0 IADD track3B0.CC, track3B0, 4x<16>; +--:-:-:-:1 STS [writeBs + 4x<1*128 + 3*32>], load3B1; +--:-:-:-:1 STS [writeBs + 4x<2*128 + 3*32>], load3B2; +--:-:-:-:4 STS [writeBs + 4x<3*128 + 3*32>], load3B3; + +--:-:-:-:0 IADD.X track3B1, track3B1, RZ; + +10:-:-:-:1 STS [writeAs + 4x<0*32>], loadA0; +--:-:-:-:0 IADD trackA0.CC, trackA0, 4x<16>; +--:-:-:-:1 STS [writeAs + 4x<1*32>], loadA1; +--:-:-:-:1 STS [writeAs + 4x<2*32>], loadA2; +--:-:-:-:1 STS [writeAs + 
4x<3*32>], loadA3; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + + + our $vec; + return $vec ? q{ +--:-:3:-:1 @P2 LDG.E.CI.128 load0B, [track0B]; +--:-:4:-:1 @P3 LDG.E.CI.128 load1B, [track1B]; +--:-:5:-:1 @P4 LDG.E.CI.128 load2B, [track2B]; +--:-:5:-:1 @P5 LDG.E.CI.128 load3B, [track3B]; +--:-:6:-:1 @P6 LDG.E.CI.128 loadA, [trackA]; + } : q{ +--:-:3:-:1 @P2 LDG.E.CI load0B0, [track0B + 4x<0>]; +--:-:3:-:1 @P2 LDG.E.CI load0B1, [track0B + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load0B2, [track0B + 4x<2>]; +--:-:3:-:1 @P2 LDG.E.CI load0B3, [track0B + 4x<3>]; + +--:-:4:-:1 @P3 LDG.E.CI load1B0, [track1B + 4x<0>]; +--:-:4:-:1 @P3 LDG.E.CI load1B1, [track1B + 4x<1>]; +--:-:4:-:1 @P3 LDG.E.CI load1B2, [track1B + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load1B3, [track1B + 4x<3>]; + +--:-:5:-:1 @P4 LDG.E.CI load2B0, [track2B + 4x<0>]; +--:-:5:-:1 @P4 LDG.E.CI load2B1, [track2B + 4x<1>]; +--:-:5:-:1 @P4 LDG.E.CI load2B2, [track2B + 4x<2>]; +--:-:5:-:1 @P4 LDG.E.CI load2B3, [track2B + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load3B0, [track3B + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load3B1, [track3B + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load3B2, [track3B + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load3B3, [track3B + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:6:-:1 @P6 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:6:-:1 @P6 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:6:-:1 @P6 LDG.E.CI loadA3, [trackA + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 1; + our $shiftBX = 1; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 0*32>], load0B0;\n", + j3c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 0*32>], load0B1;\n", + j3c10 => 
"--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 0*32>], load0B2;\n", + j3c12 => "--:3:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 0*32>], load0B3;\n", + + j5c6 => "08:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 1*32>], load1B0;\n", + j5c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 1*32>], load1B1;\n", + j5c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 1*32>], load1B2;\n", + j5c12 => "--:4:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 1*32>], load1B3;\n", + + j7c6 => "10:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 2*32>], load2B0;\n", + j7c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 2*32>], load2B1;\n", + j7c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 2*32>], load2B2;\n", + j7c12 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 2*32>], load2B3;\n", + + j9c6 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<0*128 + 3*32>], load3B0;\n", + j9c8 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<1*128 + 3*32>], load3B1;\n", + j9c10 => "--:-:-:-:1 \@P0 STS [writeBs + 4x<2*128 + 3*32>], load3B2;\n", + j9c12 => "--:5:-:-:1 \@P0 STS [writeBs + 4x<3*128 + 3*32>], load3B3;\n", + + j11c6 => "20:-:-:-:1 \@P0 STS [writeAs + 4x<0*32>], loadA0;\n", + j11c8 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<1*32>], loadA1;\n", + j11c10 => "--:-:-:-:1 \@P0 STS [writeAs + 4x<2*32>], loadA2;\n", + j11c12 => "--:6:-:-:1 \@P0 STS [writeAs + 4x<3*32>], loadA3;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0B0.CC, track0B0, 4x<16>;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0B1, track0B1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1B0.CC, track1B0, 4x<16>;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1B1, track1B1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P4 IADD track2B0.CC, track2B0, 4x<16>;\n", + j7c13 => "--:-:-:-:1 \@P4 IADD.X track2B1, track2B1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3B0.CC, track3B0, 4x<16>;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3B1, track3B1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackA0.CC, trackA0, 4x<16>;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackA1, trackA1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, 
PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j7c14 => "--:-:-:-:1 ISETP.GE.AND P4, PT, k, 32, P4;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.128 load0B, [track0B];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.128 load1B, [track1B];\n", + j9c29 => "10:-:5:-:1 \@P4 LDG.E.CI.128 load2B, [track2B];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.128 load3B, [track3B];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.128 loadA, [trackA];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P4 LDG.E.CI load2B0, [track2B + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P4 LDG.E.CI load2B1, [track2B + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P4 LDG.E.CI load2B2, [track2B + 4x<2>];\n", + j10c3 => "--:-:-:-:1 \@P4 LDG.E.CI load2B3, [track2B + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI 
load3B3, [track3B + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_rnn_bprop_common_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_rnn_bprop_common_128x32.sass new file mode 100644 index 0000000..9f5919a --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_rnn_bprop_common_128x32.sass @@ -0,0 +1,362 @@ +# sgemm_common_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ? $shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = $ins =~ /\A[^\n]*(?:LDS|F2F|I2I|LDG|STS|BAR|BRA)/ ? 0 : 1; # only the first line of $ins is tested; an empty $ins keeps stall 1 without matching against undef + + my $yield = $c == 16 && $stall ? 
'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; +--:-:-:-:5 MOV xcutoff, param_xcutoff; + +--:-:-:-:6 LOP.AND.NZ P0, RZ, flags, 4; +--:-:-:-:6 @P0 IADD offsetC, -time_step, param_unrolling; +--:-:-:-:6 @P0 IADD offsetC, offsetC, -1; +--:-:-:-:6 @!P0 MOV offsetC, time_step; + +// baseH = param_H + dimH * time_step +--:-:-:-:1 XMAD offsetH, offsetC, param_dimH, RZ; +--:-:-:-:1 LEA baseH0.CC, offsetH, param_H[0], 2; +--:-:-:-:1 LEA.HI.X baseH1, offsetH, param_H[1], RZ, 2; + +// baseC = param_C + dimC * time_step +--:-:-:-:1 XMAD offsetC, offsetC, param_dimC, RZ; +--:-:-:-:1 LEA baseC0.CC, offsetC, param_C[0], 2; +--:-:-:-:1 LEA.HI.X baseC1, offsetC, param_C[1], RZ, 2; + +// writeCs = (readAs / 4) * 32 + readBs; +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 3; + +// readCs = (((tid & 96) << 2) | (tid & 31)) << 2; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 2; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*32 + tid31; +--:-:-:-:1 ISCADD cx, blkB, tid31, 5; + +// cy = blkA*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 ISCADD cy00, blkA, cy00, 7; + +// C += (cy*ldc + cx) * 4; +// C += (ldcz*blockZ + ldc*cy + cx00) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, ldc, cy00, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, RZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, baseC0, 2; +--:-:-:-:1 LEA.HI.X C00y1, ci, baseC1, RZ, 2; + +// Apply relu +--:-:-:-:0 LOP.AND.NZ P4, RZ, flags, 2; +// cx < n +--:-:-:-:1 
ISETP.LT.AND P6, PT, cx, param_n, PT; +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; + +--:-:-:-:1 MOV ldh1, param_ldh; + +// H += (ldh*cy + cx) * 4 +--:-:-:-:1 XMAD.LO ci, ldh1, cy00, cx, xmad_c; +--:-:-:-:1 LEA H00y0.CC, ci, baseH0, 2; +--:-:-:-:1 LEA.HI.X H00y1, ci, baseH1, RZ, 2; + +--:-:-:-:1 SHL ldh1, ldh1, 2; +--:-:-:-:1 SHL ldh4, ldh1, 2; +--:-:-:-:1 SHL ldh60, ldh1, 6; +--:-:-:-:1 IADD ldh60, ldh60, -ldh4; + + +--:-:-:-:4 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 MOV d0, RZ; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:4 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 MOV d1, RZ; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:3 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 MOV d2, RZ; +--:-:-:-:1 MOV d3, RZ; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:1 IADD.X C12y1, C08y1, RZ; + +--:-:-:-:6 IADD H04y0.CC, H00y0, ldh4; +--:-:-:-:1 IADD.X H04y1, H00y1, RZ; +--:-:-:-:6 IADD H08y0.CC, H04y0, ldh4; +--:-:-:-:1 IADD.X H08y1, H04y1, RZ; +--:-:-:-:6 IADD H12y0.CC, H08y0, ldh4; +--:-:-:-:0 IADD.X H12y1, H08y1, RZ; + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" . + "--:-:-:-:6 IADD H00y0.CC, H00y0, ldh60;\n" . + "--:-:-:-:1 IADD.X H00y1, H00y1, RZ;\n" . + "--:-:-:-:6 IADD H04y0.CC, H04y0, ldh60;\n" . + "--:-:-:-:1 IADD.X H04y1, H04y1, RZ;\n" . 
+ "--:-:-:-:6 IADD H08y0.CC, H08y0, ldh60;\n" . + "--:-:-:-:1 IADD.X H08y1, H08y1, RZ;\n" . + "--:-:-:-:6 IADD H12y0.CC, H12y0, ldh60;\n" . + "--:-:-:-:1 IADD.X H12y1, H12y1, RZ;\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . + "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:1 MOV lockAddr0, param_lockAddr[0]; +--:-:-:-:1 MOV lockAddr1, param_lockAddr[1]; + +// time_step = time_step + 1 +--:-:-:-:6 IADD time_step, time_step, 1; +--:-:-:-:1 ISETP.LT.AND P0, PT, time_step, param_unrolling, PT; + +// Synchronize all blocks +--:-:-:-:1 ISETP.NE.AND P1, PT, tid, RZ, PT; +--:-:-:-:6 XMAD blkId, blkB, param_numAblks, blkA; +--:-:-:-:6 IADD nextBlk, blkId, 1; +--:-:-:-:8 ISETP.EQ.OR P2, PT, nextBlk, param_numBlks, P1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 SSY SSY_TARGET1; +--:-:-:-:d @P1 SYNC; +--:-:-:-:6 @P2 MOV nextBlk, RZ; + +SPINLOCK1: +--:-:1:Y:2 ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk; +01:-:-:Y:d ISETP.NE.AND P1, PT, lockVal, blkId, PT; +--:-:-:-:d @P1 BRA.U SPINLOCK1; +--:-:-:-:d SYNC; + +SSY_TARGET1: +--:-:-:-:1 SSY SSY_TARGET2; +--:-:-:-:d @P2 SYNC; +--:-:-:-:6 MOV nextBlk, RZ; + +SPINLOCK2: +--:-:1:Y:2 ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk; +01:-:-:Y:d ISETP.NE.AND P1, PT, lockVal, RZ, PT; +--:-:-:-:5 @P1 BRA.U SPINLOCK2; +--:-:-:-:d SYNC; + +SSY_TARGET2: +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:f MEMBAR.GL; + +//Loop back to beginning of GEMM loop +--:-:-:Y:5 @P0 BRA.U RNN_LOOP; + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 LDG.E h0, [H00y]; +--:-:-:-:1 LDG.E h1, [H04y]; +--:-:-:-:1 LDG.E h2, [H08y]; +--:-:-:-:1 LDG.E h3, [H12y]; + + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + 
+--:-:1:-:1 @P0 LDG.E d0, [C00y]; +--:-:2:-:1 @P1 LDG.E d1, [C04y]; +--:-:3:-:1 @P2 LDG.E d2, [C08y]; +--:-:4:-:1 @P3 LDG.E d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:3 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*32>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*32>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*32>]; + + +--:-:-:-:1 P2R predSave, PR, RZ, 0x0f; + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, d3, beta, c3; + +//Bprop for activation: Rectlinclip + +--:-:-:-:1 FSETP.LT.AND P0, PT, RZ, h0, PT; +--:-:-:-:1 FSETP.LT.AND P1, PT, RZ, h1, PT; +--:-:-:-:1 FSETP.LT.AND P2, PT, RZ, h2, PT; +--:-:-:-:1 FSETP.LT.AND P3, PT, RZ, h3, PT; +--:-:-:-:1 FSETP.LT.AND P0, PT, h0, xcutoff, P0; +--:-:-:-:1 FSETP.LT.AND P1, PT, h1, xcutoff, P1; +--:-:-:-:1 FSETP.LT.AND P2, PT, h2, xcutoff, P2; +--:-:-:-:1 FSETP.LT.AND P3, PT, h3, xcutoff, P3; +--:-:-:-:1 SEL c0, c0, RZ, P0; +--:-:-:-:1 SEL c1, c1, RZ, P1; +--:-:-:-:1 SEL c2, c2, RZ, P2; +--:-:-:-:1 SEL c3, c3, RZ, P3; + + +--:-:-:Y:d R2P PR, predSave, 0x0f; + +--:1:-:-:1 @P0 STG.E [C00y], c0; +--:2:-:-:1 @P1 STG.E [C04y], c1; +--:3:-:-:1 @P2 STG.E [C08y], c2; +--:4:-:-:1 @P3 STG.E [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; 
+--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:1 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:6 IADD H00y0.CC, H00y0, ldh1; +--:-:-:-:1 IADD.X H00y1, H00y1, RZ; +--:-:-:-:6 IADD H04y0.CC, H04y0, ldh1; +--:-:-:-:1 IADD.X H04y1, H04y1, RZ; +--:-:-:-:6 IADD H08y0.CC, H08y0, ldh1; +--:-:-:-:1 IADD.X H08y1, H08y1, RZ; +--:-:-:-:6 IADD H12y0.CC, H12y0, ldh1; +--:-:-:-:0 IADD.X H12y1, H12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/sgemm_rnn_common_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_rnn_common_128x32.sass new file mode 100644 index 0000000..67bda6f --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_rnn_common_128x32.sass @@ -0,0 +1,348 @@ +# sgemm_common_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +--:-:1:-:1 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00 + 0*8>]; +--:-:1:-:1 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Bx0, [readBs + 4x<1*32 + 00 + 0*8>]; +--:-:2:-:1 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>]; + +LOOP: + + + + our @top; + our %insert; + our $shiftAX; + our $shiftBX; + + my @cOrder; + my @swirl = ([0,2],[1,2],[1,0],[0,0]); + my @y = (0,1,4,5); + foreach my $x (0,2) + { + foreach my $y (@y) + { + push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; + } + @y = reverse @y; + } + + my $out = join '', @top; + + foreach my $j (0 .. 15) + { + my $barrier = $j & 1 ? 2 : 1; + my $rsPred = $j >= 14 ? '@P0' : ' '; + my $loadReg = ($j + 2) & 3; + my $shareLine = ($j + 2) & 15; + my $shiftA = $shiftAX ? $shareLine >> 2 : 0; + my $shiftB = $shiftBX ? $shareLine >> 2 : 0; + my $compute = $j & 3; + + + $insert{"j${j}c0"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + $insert{"j${j}c2"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB; + $insert{"j${j}c4"} = sprintf "--:-:%d:-:1 %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA; + + foreach my $c (0 .. 31) + { + my ($x,$y) = @{$cOrder[$c]}; + + my $ins = $insert{"j${j}c$c"} || ''; + + my $wait = $c == 0 ? "0$barrier" : '--'; + + my $stall = $ins =~ /\A[^\n]*(?:LDS|F2F|I2I|LDG|STS|BAR|BRA)/ ? 0 : 1; # only the first line of $ins is tested; an empty $ins keeps stall 1 without matching against undef + + my $yield = $c == 16 && $stall ? 
'Y' : '-'; + + my $ctrl = "$wait:-:-:$yield:$stall"; + + $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl, $x,$y, $compute,$x, $compute,$y, $x,$y, $ins; + } + } + return $out; + + + + + +--:-:-:-:1 MOV alpha, param_alpha; +--:-:-:-:1 MOV beta, param_beta; +--:-:-:-:1 MOV flags, param_flags; +--:-:-:-:5 MOV xcutoff, param_xcutoff; + +--:-:-:-:6 LOP.AND.NZ P0, RZ, flags, 4; +--:-:-:-:6 @P0 IADD offsetC, -time_step, param_unrolling; +--:-:-:-:6 @P0 IADD offsetC, offsetC, -1; +--:-:-:-:6 @!P0 MOV offsetC, time_step; + +// baseC = param_C + dimC * time_step +--:-:-:-:1 XMAD offsetC, offsetC, param_dimC, RZ; +--:-:-:-:1 LEA baseC0.CC, offsetC, param_C[0], 2; +--:-:-:-:1 LEA.HI.X baseC1, offsetC, param_C[1], RZ, 2; + +// writeCs = (readAs / 4) * 32 + readBs; +--:-:-:-:1 ISETP.GT.AND P0, PT, swapBuf, RZ, PT; +--:-:-:-:1 IADD readBs, readBs, -4x; +--:-:-:-:1 @P0 IADD readAs, readAs, -swapBuf; +--:-:-:-:1 @P0 IADD readBs, readBs, -swapBuf; +--:-:-:-:1 ISCADD writeCs, readAs, readBs, 3; + +// readCs = (((tid & 96) << 2) | (tid & 31)) << 2; +--:-:-:-:1 LOP.AND tid31, tid, 31; +--:-:-:-:1 LOP.AND tid96, tid, 96; +--:-:-:-:1 ISCADD readCs, tid96, tid31, 2; +--:-:-:-:1 SHL readCs, readCs, 2; + +// cx = blkB*32 + tid31; +--:-:-:-:1 ISCADD cx, blkB, tid31, 5; + +// cy = blkA*128 + (tid96 >> 1) +--:-:-:-:1 SHR.U32 cy00, tid96, 1; +--:-:-:-:1 ISCADD cy00, blkA, cy00, 7; + +// C += (cy*ldc + cx) * 4; +// C += (ldcz*blockZ + ldc*cy + cx00) * 4; +--:-:-:-:1 MOV ldc, param_ldc; +--:-:-:-:1 MOV ldcz, param_ldcz; +--:-:-:-:1 XMAD.LO ci, ldc, cy00, cx, xmad_c; +--:-:-:-:1 XMAD.LO2 ci, ldcz, RZ, ci; +--:-:-:-:1 LEA C00y0.CC, ci, baseC0, 2; +--:-:-:-:1 LEA.HI.X C00y1, ci, baseC1, RZ, 2; + +// Apply relu +--:-:-:-:0 LOP.AND.NZ P4, RZ, flags, 2; +// cx < n +--:-:-:-:1 ISETP.LT.AND P6, PT, cx, param_n, PT; +// beta != 0 +--:-:-:-:1 ISETP.NE.AND P5, PT, beta, RZ, P6; + + +--:-:-:-:1 SHL ldc1, ldc, 2; +--:-:-:-:1 SHL ldc4, ldc, 4; +--:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; + + 
+--:-:-:-:4 IADD C04y0.CC, C00y0, ldc4; +--:-:-:-:1 MOV d0, RZ; +--:-:-:-:1 IADD cy04, cy00, 4; +--:-:-:-:1 IADD.X C04y1, C00y1, RZ; +--:-:-:-:4 IADD C08y0.CC, C04y0, ldc4; +--:-:-:-:1 MOV d1, RZ; +--:-:-:-:1 IADD cy08, cy00, 8; +--:-:-:-:1 IADD.X C08y1, C04y1, RZ; +--:-:-:-:3 IADD C12y0.CC, C08y0, ldc4; +--:-:-:-:1 MOV d2, RZ; +--:-:-:-:1 MOV d3, RZ; +--:-:-:-:1 IADD cy12, cy00, 12; +--:-:-:-:1 IADD.X C12y1, C08y1, RZ; + + +// bias_track = bias + cy +--:-:-:-:1 LEA bias00y0.CC, cy00, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X bias00y1, cy00, param_bias[1], RZ, 2; +--:-:-:-:1 LEA bias04y0.CC, cy04, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X bias04y1, cy04, param_bias[1], RZ, 2; +--:-:-:-:1 LEA bias08y0.CC, cy08, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X bias08y1, cy08, param_bias[1], RZ, 2; +--:-:-:-:1 LEA bias12y0.CC, cy12, param_bias[0], 2; +--:-:-:-:1 LEA.HI.X bias12y1, cy12, param_bias[1], RZ, 2; + + +--:-:-:-:5 BAR.SYNC 0; + + + + my $out; + foreach my $y (0..7) + { + $out .= + "--:-:-:-:5 IADD C00y0.CC, C00y0, ldc60;\n" . + "--:-:-:-:1 IADD cy00, cy00, 60;\n" . + "--:-:-:-:1 IADD.X C00y1, C00y1, RZ;\n" . + "--:-:-:-:5 IADD C04y0.CC, C04y0, ldc60;\n" . + "--:-:-:-:1 IADD cy04, cy04, 60;\n" . + "--:-:-:-:1 IADD.X C04y1, C04y1, RZ;\n" . + "--:-:-:-:5 IADD C08y0.CC, C08y0, ldc60;\n" . + "--:-:-:-:1 IADD cy08, cy08, 60;\n" . + "--:-:-:-:1 IADD.X C08y1, C08y1, RZ;\n" . + "--:-:-:-:5 IADD C12y0.CC, C12y0, ldc60;\n" . + "--:-:-:-:1 IADD cy12, cy12, 60;\n" . + "--:-:-:-:1 IADD.X C12y1, C12y1, RZ;\n\n" . + "--:-:-:-:6 IADD bias00y0.CC, bias00y0, 240;\n" . + "--:-:-:-:1 IADD.X bias00y1, bias00y1, RZ;\n" . + "--:-:-:-:6 IADD bias04y0.CC, bias04y0, 240;\n" . + "--:-:-:-:1 IADD.X bias04y1, bias04y1, RZ;\n" . + "--:-:-:-:6 IADD bias08y0.CC, bias08y0, 240;\n" . + "--:-:-:-:1 IADD.X bias08y1, bias08y1, RZ;\n" . + "--:-:-:-:6 IADD bias12y0.CC, bias12y0, 240;\n" . + "--:-:-:-:1 IADD.X bias12y1, bias12y1, RZ;\n" if $y == 4; + + $out .= sprintf( + "--:-:-:-:1 FMUL c0, cx0y%d, alpha;\n" . 
+ "--:-:-:-:1 FMUL c1, cx1y%d, alpha;\n" . + "--:-:-:-:1 FMUL c2, cx2y%d, alpha;\n" . + "--:-:-:-:0 FMUL c3, cx3y%d, alpha;\n", + ($y) x 4); + + $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; + } + return $out; + + + +--:-:-:-:1 MOV lockAddr0, param_lockAddr[0]; +--:-:-:-:1 MOV lockAddr1, param_lockAddr[1]; + +// time_step = time_step + 1 +--:-:-:-:6 IADD time_step, time_step, 1; +--:-:-:-:1 ISETP.LT.AND P0, PT, time_step, param_unrolling, PT; + +// Synchronize all blocks +--:-:-:-:1 ISETP.NE.AND P1, PT, tid, RZ, PT; +--:-:-:-:6 XMAD blkId, blkB, param_numAblks, blkA; +--:-:-:-:6 IADD nextBlk, blkId, 1; +--:-:-:-:8 ISETP.EQ.OR P2, PT, nextBlk, param_numBlks, P1; + +--:-:-:-:5 BAR.SYNC 0; + +--:-:-:-:1 SSY SSY_TARGET1; +--:-:-:-:d @P1 SYNC; +--:-:-:-:6 @P2 MOV nextBlk, RZ; + +SPINLOCK1: +--:-:1:Y:2 ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk; +01:-:-:Y:d ISETP.NE.AND P1, PT, lockVal, blkId, PT; +--:-:-:-:d @P1 BRA.U SPINLOCK1; +--:-:-:-:d SYNC; + +SSY_TARGET1: +--:-:-:-:1 SSY SSY_TARGET2; +--:-:-:-:d @P2 SYNC; +--:-:-:-:6 MOV nextBlk, RZ; + +SPINLOCK2: +--:-:1:Y:2 ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk; +01:-:-:Y:d ISETP.NE.AND P1, PT, lockVal, RZ, PT; +--:-:-:-:5 @P1 BRA.U SPINLOCK2; +--:-:-:-:d SYNC; + +SSY_TARGET2: +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:f MEMBAR.GL; + +//Loop back to beginning of GEMM loop +--:-:-:Y:5 @P0 BRA.U RNN_LOOP; + +--:-:-:-:5 EXIT; + +STORE_C: + + +--:-:-:-:1 LDG.E.CI b0, [bias00y]; +--:-:-:-:1 LDG.E.CI b1, [bias04y]; +--:-:-:-:1 LDG.E.CI b2, [bias08y]; +--:-:-:-:1 LDG.E.CI b3, [bias12y]; + + + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P5; + +--:-:1:-:1 @P0 LDG.E d0, [C00y]; +--:-:2:-:1 @P1 LDG.E d1, [C04y]; +--:-:3:-:1 @P2 LDG.E d2, [C08y]; +--:-:4:-:1 @P3 LDG.E d3, [C12y]; +--:-:-:-:1 @!P0 MOV d0, RZ; +--:-:-:-:1 @!P1 MOV d1, RZ; +--:-:-:-:1 @!P2 MOV d2, RZ; +--:-:-:-:1 @!P3 MOV 
d3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, cy00, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, cy04, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, cy08, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, cy12, param_m, P6; + +--:-:-:-:1 IADD cy00, cy00, 1; +--:-:-:-:1 IADD cy04, cy04, 1; +--:-:-:-:1 IADD cy08, cy08, 1; +--:-:-:-:3 IADD cy12, cy12, 1; + +--:-:-:-:1 @P4 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c2, c2, RZ, !PT; +--:-:-:-:1 @P4 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 STS.128 [writeCs], c0; +--:-:-:-:1 LDS c0, [readCs + 4x<0*32>]; +--:-:5:-:1 LDS c1, [readCs + 4x<1*32>]; +--:-:-:-:1 LDS c2, [readCs + 4x<2*32>]; +--:-:6:-:1 LDS c3, [readCs + 4x<3*32>]; + + +11:-:-:-:1 @P5 FFMA c0, d0, beta, c0; +02:-:-:-:1 @P5 FFMA c1, d1, beta, c1; +24:-:-:-:1 @P5 FFMA c2, d2, beta, c2; +08:-:-:-:3 @P5 FFMA c3, d3, beta, c3; + +--:-:-:-:1 FADD c0, c0, b0; +--:-:-:-:1 FADD c1, c1, b1; +--:-:-:-:1 FADD c2, c2, b2; +--:-:-:-:3 FADD c3, c3, b3; + +//Activation function: Rectlinclip + +--:-:-:-:1 FMNMX c0, c0, RZ, !PT; +--:-:-:-:1 FMNMX c1, c1, RZ, !PT; +--:-:-:-:1 FMNMX c2, c2, RZ, !PT; +--:-:-:-:3 FMNMX c3, c3, RZ, !PT; + +--:-:-:-:1 FMNMX c0, c0, xcutoff, PT; +--:-:-:-:1 FMNMX c1, c1, xcutoff, PT; +--:-:-:-:1 FMNMX c2, c2, xcutoff, PT; +--:-:-:-:3 FMNMX c3, c3, xcutoff, PT; + + +--:1:-:-:1 @P0 STG.E [C00y], c0; +--:2:-:-:1 @P1 STG.E [C04y], c1; +--:3:-:-:1 @P2 STG.E [C08y], c2; +--:4:-:-:1 @P3 STG.E [C12y], c3; + +01:-:-:-:6 IADD C00y0.CC, C00y0, ldc1; +--:-:-:-:1 IADD.X C00y1, C00y1, RZ; +02:-:-:-:6 IADD C04y0.CC, C04y0, ldc1; +--:-:-:-:1 IADD.X C04y1, C04y1, RZ; +04:-:-:-:6 IADD C08y0.CC, C08y0, ldc1; +--:-:-:-:1 IADD.X C08y1, C08y1, RZ; +08:-:-:-:6 IADD C12y0.CC, C12y0, ldc1; +--:-:-:-:1 IADD.X C12y1, C12y1, RZ; + +--:-:-:-:6 IADD bias00y0.CC, bias00y0, 4; +--:-:-:-:1 IADD.X bias00y1, bias00y1, RZ; +--:-:-:-:6 IADD bias04y0.CC, bias04y0, 4; +--:-:-:-:1 IADD.X bias04y1, bias04y1, RZ; +--:-:-:-:6 IADD bias08y0.CC, bias08y0, 4; 
+--:-:-:-:1 IADD.X bias08y1, bias08y1, RZ; +--:-:-:-:6 IADD bias12y0.CC, bias12y0, 4; +--:-:-:-:0 IADD.X bias12y1, bias12y1, RZ; + +--:-:-:-:5 RET; diff --git a/Kernel/SGEMM/Pascal/sgemm_tn_128x128.sass b/Kernel/SGEMM/Pascal/sgemm_tn_128x128.sass new file mode 100644 index 0000000..5099001 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_tn_128x128.sass @@ -0,0 +1,279 @@ +# Kernel: sgemm_tn_128x128 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*8*4> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, tid1, tid7, tidX, blk, tid31, tid128 + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-95 ~ x<1-3>, y<1-3> + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-103 : loadA<0-3>, loadB<0-3> + + 104-107 : trackA<0-1>, trackB<0-1> + + 108-121 ~ writeS, lda8, k, tidY, txa, txb, ta, tb, loop + 122-127 ~ readAs, readBs, tid + + 64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV loop, RZ; +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join('', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15); + + +// tidX = (tid & 31) << 2 +// tidY = (tid >> 5) & 7 +01:-:-:-:1 LOP.AND tid31, tid, 31;
+--:-:-:-:1 SHL tidX, tid31, 2; +--:-:-:-:1 BFE.U32 tidY, tid, 0x305; // 3 bits at position 5 + +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 5; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + +// trackA += (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 4 +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; + +// trackB += (blkB*128 + ldb*tidY + tidX + ldbz*blkZ) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidX, 7; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// writeS = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeS, tidY, tidX, 7; +--:-:-:-:1 SHL writeS, writeS, 2; +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<128*8*2>; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; + + +// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096; +--:-:-:-:1 LOP.AND tid128, tid, 128; +--:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 SHR.U32 readBs, tid128, 4; +--:-:-:-:1 LOP.OR readBs, readBs, tid7; +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + + +REMAINDER: + + + + + our $vec; + return $vec ?
q{ +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:1 LOP.AND.NZ P4, RZ, k, 7; +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 8, P4; + +// doLoad = tidY < k && txa|txb < n|m +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P6; + +--:-:2:-:1 @P2 LDG.E.CI.128 loadA, [trackA]; +--:-:3:-:1 @P3 LDG.E.CI.128 loadB, [trackB]; + +--:-:5:-:1 @!P2 LDS.U.128 loadA, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 loadB, [addr_zero]; + + // Vec 4 and scalar loads + } : q{ + +// doLoadA = tidY < k && txa < m +// doLoadB = tidY < k && txb < n +--:-:-:-:1 IADD x1, txa, 1; +--:-:-:-:1 IADD x2, txa, 2; +--:-:-:-:1 IADD x3, txa, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_m, P0; + +--:-:2:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI loadA1, [trackA + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 IADD y1, txb, 1; +--:-:-:-:1 IADD y2, txb, 2; +--:-:-:-:1 IADD y3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_n, P0; + +--:-:3:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 8, PT; + }; + + + + +12:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadA0; + +--:-:-:-:6 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + 
+24:-:-:-:1 STS.128 [writeS + 4x<8*128>], loadB0; + +--:-:-:-:1 IADD trackB0.CC, trackB0, param_ldb8; + +--:-:-:-:1 LOP.XOR readAs, readAs, 4x<128*8*2>; +--:-:-:-:0 LOP.XOR readBs, readBs, 4x<128*8*2>; +01:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 LOP.XOR writeS, writeS, 4x<128*8*2>; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P5;\n"); + + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P6;\n", + j0c8 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + ($vec ? + ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI.128 loadA, [trackA];\n", + j0c13 => "--:-:3:-:1 \@P3 LDG.E.CI.128 loadB, [trackB];\n", + ) : + ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j0c29 => "--:-:2:-:1 \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j0c31 => "--:-:2:-:1 \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j0c33 => "--:-:2:-:1 \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + + j0c35 => "--:-:3:-:1 \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j1c29 => "--:-:3:-:1 \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j1c31 => "--:-:3:-:1 \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j1c33 => "--:-:3:-:1 \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j5c33 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*128>], loadA0;\n", + + j5c46 => "--:-:-:-:1 \@P2 IADD trackA0.CC, trackA0, param_lda8;\n", + j5c54 => "--:-:-:-:1 \@P2 IADD.X trackA1, trackA1, RZ;\n", + + j6c33 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<8*128>], loadB0;\n", + + j6c46 => "--:-:-:-:1 \@P3 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c54 => "--:-:-:-:1 \@P3 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" . + "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" . + "--:-:-:-:1 IADD32I k, k, -8;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . 
+ "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_tn_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_tn_128x32.sass new file mode 100644 index 0000000..0b9ffc1 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_tn_128x32.sass @@ -0,0 +1,447 @@ +# Kernel: sgemm_tn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<128*16*2 + 32*16*2> + szShareA : 128*16 + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, 
j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 5; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL lda16, lda, 6; +--:-:-:-:1 SHL ldb16, ldb, 6; +--:-:-:-:1 SHL lda4, lda, 2; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +// tidAX = (tid & 31) << 2 +// tidAY = (tid >> 5) +01:-:-:-:1 LOP.AND tidAX, tid, 31; +--:-:-:-:1 SHL tidAX, tidAX, 2; +--:-:-:-:1 SHR.U32 tidAY, tid, 5; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += (blkA*128 + tidAX + lda*tidAY) * 4 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO2 ta0, lda, tidAY, txa; +08:-:-:-:1 XMAD.LO2 ta0, ldaz, blkZ, ta0; +--:-:-:-:1 IADD ta1, ta0, lda4; +--:-:-:-:1 IADD ta2, ta1, lda4; +--:-:-:-:1 IADD ta3, ta2, lda4; + +--:-:-:-:1 LEA track0A0.CC, ta0, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta0, param_A[1], RZ, 2; +--:-:-:-:1 LEA track1A0.CC, ta1, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track1A1, ta1, param_A[1], RZ, 2; +--:-:-:-:1 LEA 
track2A0.CC, ta2, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track2A1, ta2, param_A[1], RZ, 2; +--:-:-:-:1 LEA track3A0.CC, ta3, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track3A1, ta3, param_A[1], RZ, 2; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 2; + +// writeAs = (tidAY*128 + tidAX) * 4 + 4x<szShareA + szShareB> +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB> +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x<szShareA>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<szShareA + szShareB>; // NOTE(review): 4x<...> operands here were reconstructed from szShareA/szShareB and the 128x64 kernel layout - confirm + + +REMAINDER: + + + +--:-:-:-:1 IADD tidAY1, tidAY, 4; +--:-:-:-:1 IADD tidAY2, tidAY, 8; +--:-:-:-:1 IADD tidAY3, tidAY, 12; + + + our $vec; + return $vec ?
q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.CI.128 load0A, [track0A]; +--:-:2:-:1 @P1 LDG.E.CI.128 load1A, [track1A]; +--:-:3:-:1 @P2 LDG.E.CI.128 load2A, [track2A]; +--:-:4:-:1 @P3 LDG.E.CI.128 load3A, [track3A]; +--:-:5:-:1 @P4 LDG.E.CI.128 loadB, [trackB]; + + + +--:-:6:-:1 @!P0 LDS.U.128 load0A, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 load1A, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.128 load2A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load3A, [addr_zero]; +--:-:6:-:2 @!P4 LDS.U.128 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD txa1, txa, 1; +--:-:-:-:1 IADD txa2, txa, 2; +--:-:-:-:1 IADD txa3, txa, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P4; + +--:-:1:-:1 @P0 LDG.E.CI load0A0, [track0A + 4x<0>]; +--:-:1:-:1 @P1 LDG.E.CI load0A1, [track0A + 4x<1>]; +--:-:1:-:1 @P2 LDG.E.CI load0A2, [track0A + 4x<2>]; +--:-:1:-:1 @P3 LDG.E.CI load0A3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY1, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:2:-:1 @P0 LDG.E.CI load1A0, [track1A + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI load1A1, [track1A + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI load1A2, [track1A + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI load1A3, 
[track1A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P6; + +--:-:3:-:1 @P0 LDG.E.CI load2A0, [track2A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E.CI load2A1, [track2A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E.CI load2A2, [track2A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E.CI load2A3, [track2A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:4:-:1 @P0 LDG.E.CI load3A0, [track3A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI load3A1, [track3A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI load3A2, [track3A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI load3A3, [track3A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:5:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:5:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:5:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:5:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; 
+--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 15; + + + +21:-:-:-:1 STS.128 [writeAs + 4x<0*128>], load0A; +--:-:-:-:6 IADD track0A0.CC, track0A0, lda16; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS.128 [writeAs + 4x<4*128>], load1A; +--:-:-:-:6 IADD track1A0.CC, track1A0, lda16; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS.128 [writeAs + 4x<8*128>], load2A; +--:-:-:-:6 IADD track2A0.CC, track2A0, lda16; +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS.128 [writeAs + 4x<12*128>], load3A; +--:-:-:-:6 IADD track3A0.CC, track3A0, lda16; +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P5 LDG.E.CI.128 load0A, [track0A]; +--:-:4:-:1 @P5 LDG.E.CI.128 load1A, [track1A]; +--:-:5:-:1 @P5 LDG.E.CI.128 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.CI.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.CI.128 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P5 LDG.E.CI load0A0, [track0A + 4x<0>]; +--:-:3:-:1 @P5 LDG.E.CI load0A1, [track0A + 4x<1>]; +--:-:3:-:1 @P5 LDG.E.CI load0A2, [track0A + 4x<2>]; +--:-:3:-:1 @P5 LDG.E.CI load0A3, [track0A + 4x<3>]; + +--:-:4:-:1 @P5 LDG.E.CI load1A0, [track1A + 4x<0>]; +--:-:4:-:1 @P5 LDG.E.CI load1A1, [track1A + 4x<1>]; +--:-:4:-:1 @P5 LDG.E.CI load1A2, [track1A + 4x<2>]; +--:-:4:-:1 @P5 LDG.E.CI load1A3, [track1A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load2A0, [track2A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load2A1, [track2A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load2A2, [track2A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load2A3, [track2A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E.CI load3A0, [track3A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E.CI load3A1, [track3A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E.CI load3A2, [track3A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E.CI load3A3, [track3A + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:6:-:1 @P6 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P6 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P6 LDG.E.CI loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 0; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n", + j9c6 => "--:5:-:-:1 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n", + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, lda16;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 
IADD track1A0.CC, track1A0, lda16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2A0.CC, track2A0, lda16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, lda16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? 
+ ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.CI.128 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.CI.128 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI.128 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.CI.128 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.CI.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E.CI load2A0, [track2A + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E.CI load2A1, [track2A + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E.CI load2A2, [track2A + 4x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E.CI load2A3, [track2A + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . 
+ "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_tn_128x64.sass b/Kernel/SGEMM/Pascal/sgemm_tn_128x64.sass new file mode 100644 index 0000000..74f13cc --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_tn_128x64.sass @@ -0,0 +1,326 @@ +# Kernel: sgemm_tn_128x64 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + addr_zero : 4x<128*8*2 + 64*8*2 + 0> + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_alpha : c[0x0][0x158] + param_beta : c[0x0][0x15c] + param_flags : c[0x0][0x160] + param_lda8 : c[0x0][0x164] + param_ldb8 : c[0x0][0x168] + param_ldc : c[0x0][0x16c] + param_m : c[0x0][0x170] + param_n : c[0x0][0x174] + param_k : c[0x0][0x178] + param_ldaz : c[0x0][0x17c] + param_ldbz : c[0x0][0x180] + param_ldcz : c[0x0][0x184] + param_loops : c[0x0][0x188] + + + + + 64-95 ~ lda, ldb, ldaz, ldbz, tid1, ta, tb, tid7, tid15, tidX, blk, txa64, xmad_tb, tid, blkA, blkB, blkZ + + 0-63 : czero<00-63> + + 3, 2,11,10,19,18,27,26 : cx<0-7>y0 + 7, 6,15,14,23,22,31,30 : cx<0-7>y1 + 1, 0, 9, 8,17,16,25,24 : cx<0-7>y2 + 5, 4,13,12,21,20,29,28 : cx<0-7>y3 + 35,34,43,42,51,50,59,58 : cx<0-7>y4 + 39,38,47,46,55,54,63,62 : cx<0-7>y5 + 33,32,41,40,49,48,57,56 : cx<0-7>y6 + 
37,36,45,44,53,52,61,60 : cx<0-7>y7 + + 64-95 ~ x<1-3>, x<65-67>, y<1-3> + + 64-79 : j0Ay<0-7>, j0Bx<0-7> + 80-95 : j1Ay<0-7>, j1Bx<0-7> + + 96-107 : loadA<0-7>, loadB<0-3> + 108-111 : trackA<0-1>, trackB<0-1> + + 112-125 ~ writeAs, writeBs, k, tidY, txa, txb, swapBuf + 126-127 ~ readAs, readBs + + 64-75 : c<0-7>, d3, d2, d1, d0 + 76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 86-125 ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; +--:-:4:-:1 S2R blkZ, SR_CTAID.X; + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; + + +--:-:-:-:1 LOP.AND tid1, tid, 1; +01:-:-:-:1 LOP.AND tid15, tid, 15; + +// tidX = (tid & 15) << 2 +// tidY = (tid >> 4) & 7 +--:-:-:-:1 SHL tidX, tid15, 2; +--:-:-:-:1 BFE.U32 tidY, tid, 0x304; // 3 bits at position 4 + +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 5; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; + + +// trackA += (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 4 +02:-:-:-:1 ISCADD txa, blkA, tidX, 7; +--:-:-:-:1 XMAD.LO2 ta, lda, tidY, txa; +08:-:-:-:1 XMAD.LO2 ta, ldaz, blkZ, ta; +--:-:-:-:1 LEA trackA0.CC, ta, param_A[0], 0x2; +--:-:-:-:1 LEA.HI.X trackA1, ta, param_A[1], RZ, 0x2; + +--:-:-:-:1 IADD txa64, txa, 64; +--:-:-:-:1 ISETP.LT.AND P4, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P5, PT, txa64, param_m, PT; + +// trackB += (blkB*64 + tidX + ldb*tidY + ldbz*blkZ) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidX, 6; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, blkZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, param_B[0], 0x2; +--:-:-:-:1 LEA.HI.X trackB1, tb, param_B[1], RZ, 0x2; + 
+--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +// Start the write buffers high +// writeAs = (128*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeAs, tidY, tidX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; +// writeBs = (64*tidY + tidX) * 4 +--:-:-:-:1 ISCADD writeBs, tidY, tidX, 6; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; + +// Start the read buffers low +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x<128*8>, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x<64*8 + 128*8>; + + + +REMAINDER: + + + + + our $vec; + return $vec ? q{ + +// doLoad = tidY < k && txa|txb < n|m +--:-:-:-:1 ISETP.LT.AND P1, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidY, k, P6; + + +--:-:2:-:1 @P1 LDG.E.CI.128 loadA0, [trackA + 4x< 0>]; +--:-:3:-:1 @P2 LDG.E.CI.128 loadA4, [trackA + 4x<64>]; +--:-:4:-:1 @P3 LDG.E.CI.128 loadB0, [trackB]; + +--:-:5:-:2 @!P1 LDS.U.128 loadA0, [addr_zero]; +--:-:5:-:2 @!P2 LDS.U.128 loadA4, [addr_zero]; +--:-:6:-:2 @!P3 LDS.U.128 loadB0, [addr_zero]; + + +// bDoRemainder = k & 7 && k > 8 +--:-:-:-:1 LOP.AND.NZ P1, RZ, k, 7; + + // Vec 4 and scalar loads + } : q{ + +// doLoadA = tidY < k && txa < m +// doLoadB = tidY < k && txb < n +--:-:-:-:1 IADD x1, txa, 1; +--:-:-:-:1 IADD x2, txa, 2; +--:-:-:-:1 IADD x3, txa, 3; +--:-:-:-:1 IADD x65, txa, 65; +--:-:-:-:1 IADD x66, txa, 66; +--:-:-:-:1 IADD x67, txa, 67; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, x1, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x2, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x3, param_m, P0; + +--:-:2:-:1 @P0 LDG.E.CI loadA0, [trackA + 4x<0>]; +--:-:2:-:1 @P1 LDG.E.CI 
loadA1, [trackA + 4x<1>]; +--:-:2:-:1 @P2 LDG.E.CI loadA2, [trackA + 4x<2>]; +--:-:2:-:1 @P3 LDG.E.CI loadA3, [trackA + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadA0, RZ; +--:-:-:-:1 @!P1 MOV loadA1, RZ; +--:-:-:-:1 @!P2 MOV loadA2, RZ; +--:-:-:-:1 @!P3 MOV loadA3, RZ; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, x65, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, x66, param_m, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, x67, param_m, P0; + +--:-:3:-:1 @P0 LDG.E.CI loadA4, [trackA + 4x<64>]; +--:-:3:-:1 @P1 LDG.E.CI loadA5, [trackA + 4x<65>]; +--:-:3:-:1 @P2 LDG.E.CI loadA6, [trackA + 4x<66>]; +--:-:3:-:1 @P3 LDG.E.CI loadA7, [trackA + 4x<67>]; + +--:-:-:-:1 @!P0 MOV loadA4, RZ; +--:-:-:-:1 @!P1 MOV loadA5, RZ; +--:-:-:-:1 @!P2 MOV loadA6, RZ; +--:-:-:-:1 @!P3 MOV loadA7, RZ; + +--:-:-:-:1 IADD y1, txb, 1; +--:-:-:-:1 IADD y2, txb, 2; +--:-:-:-:1 IADD y3, txb, 3; +--:-:-:-:1 ISETP.LT.AND P0, PT, tidY, k, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, y1, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P2, PT, y2, param_n, P0; +--:-:-:-:1 ISETP.LT.AND P3, PT, y3, param_n, P0; + +--:-:4:-:1 @P0 LDG.E.CI loadB0, [trackB + 4x<0>]; +--:-:4:-:1 @P1 LDG.E.CI loadB1, [trackB + 4x<1>]; +--:-:4:-:1 @P2 LDG.E.CI loadB2, [trackB + 4x<2>]; +--:-:4:-:1 @P3 LDG.E.CI loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + + }; + + + + +12:-:-:-:1 STS.128 [writeAs + 4x< 0>], loadA0; +04:-:-:-:1 STS.128 [writeAs + 4x<64>], loadA4; + +--:-:-:-:6 IADD trackA0.CC, trackA0, param_lda8; +--:-:-:-:0 IADD.X trackA1, trackA1, RZ; + +28:-:-:-:1 STS.128 [writeBs], loadB0; + +--:-:-:-:6 IADD trackB0.CC, trackB0, param_ldb8; +--:-:-:-:1 IADD.X trackB1, trackB1, RZ; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, 
-swapBuf; + + + our $vec; + return $vec ? q{ +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, P1; + } : q{ +--:-:-:-:0 ISETP.GT.AND P1, PT, k, 8, PT; + }; + + + + our $vec; + my $k_end = $vec ? 16 : 24; + our @top = ("--:-:-:-:1 ISETP.GE.AND P2, PT, k, $k_end, P4;\n"); + + our %insert = + ( + j0c1 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, $k_end, P5;\n", + j0c3 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, P6;\n", + + ($vec ? + ( + j0c13 => "--:-:2:-:1 \@P2 LDG.E.CI.128 loadA0, [trackA + 4x< 0>];\n", + j0c15 => "--:-:3:-:1 \@P3 LDG.E.CI.128 loadA4, [trackA + 4x<64>];\n", + j0c33 => "--:-:4:-:1 \@P0 LDG.E.CI.128 loadB0, [trackB];\n", + ) : + ( + j0c10 => "--:-:2:-:1 \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n", + j0c12 => "--:-:2:-:1 \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n", + j0c14 => "--:-:2:-:1 \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n", + j0c16 => "--:-:2:-:1 \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n", + + j0c29 => "--:-:3:-:1 \@P3 LDG.E.CI loadA4, [trackA + 4x<64>];\n", + j0c31 => "--:-:3:-:1 \@P3 LDG.E.CI loadA5, [trackA + 4x<65>];\n", + j0c33 => "--:-:3:-:1 \@P3 LDG.E.CI loadA6, [trackA + 4x<66>];\n", + j0c35 => "--:-:3:-:1 \@P3 LDG.E.CI loadA7, [trackA + 4x<67>];\n", + + j1c29 => "--:-:4:-:1 \@P0 LDG.E.CI loadB0, [trackB + 4x<0>];\n", + j1c31 => "--:-:4:-:1 \@P0 LDG.E.CI loadB1, [trackB + 4x<1>];\n", + j1c33 => "--:-:4:-:1 \@P0 LDG.E.CI loadB2, [trackB + 4x<2>];\n", + j1c35 => "--:-:4:-:1 \@P0 LDG.E.CI loadB3, [trackB + 4x<3>];\n", + ) + ), + + j1c37 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, $k_end, PT;\n", + + j1c39 => "--:-:-:-:1 IADD32I k, k, -8;\n", + + j5c31 => "02:-:-:-:1 \@P0 STS.128 [writeAs + 4x< 0>], loadA0;\n", + j5c33 => "04:-:-:-:1 \@P0 STS.128 [writeAs + 4x<64>], loadA4;\n", + + j5c46 => "--:-:-:-:1 \@P0 IADD trackA0.CC, trackA0, param_lda8;\n", + j5c54 => "--:-:-:-:1 \@P0 IADD.X trackA1, trackA1, RZ;\n", + + j6c39 => "08:-:-:-:1 \@P0 STS.128 [writeBs], loadB0;\n", + + j6c46 => "--:-:-:-:1 \@P0 IADD trackB0.CC, trackB0, param_ldb8;\n", + j6c54 => "--:-:-:-:1 
\@P0 IADD.X trackB1, trackB1, RZ;\n", + + j6c63 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + j7c63 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n", + ); + return; + + + diff --git a/Kernel/SGEMM/Pascal/sgemm_tn_rnn_bprop_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_tn_rnn_bprop_128x32.sass new file mode 100644 index 0000000..3db4612 --- /dev/null +++ b/Kernel/SGEMM/Pascal/sgemm_tn_rnn_bprop_128x32.sass @@ -0,0 +1,476 @@ +# Kernel: sgemm_tn_128x32 + +# Copyright 2014 Nervana Systems Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + + addr_zero : 4x<128*16*2 + 32*16*2> + szShareA : 128*16 + szShareB : 32*16 + + gridDimA : c[0x0][0x14] + gridDimB : c[0x0][0x18] + + param_C[0] : c[0x0][0x140] + param_C[1] : c[0x0][0x144] + param_A[0] : c[0x0][0x148] + param_A[1] : c[0x0][0x14c] + param_B[0] : c[0x0][0x150] + param_B[1] : c[0x0][0x154] + param_H[0] : c[0x0][0x158] + param_H[1] : c[0x0][0x15c] + param_lockAddr[0] : c[0x0][0x160] + param_lockAddr[1] : c[0x0][0x164] + param_alpha : c[0x0][0x168] + param_beta : c[0x0][0x16c] + param_xcutoff : c[0x0][0x170] + param_flags : c[0x0][0x174] + param_lda8 : c[0x0][0x178] + param_ldb8 : c[0x0][0x17c] + param_ldc : c[0x0][0x180] + param_ldh : c[0x0][0x184] + param_m : c[0x0][0x188] + param_n : c[0x0][0x18c] + param_k : c[0x0][0x190] + param_ldaz : c[0x0][0x194] + param_ldbz : c[0x0][0x198] + param_ldcz : c[0x0][0x19c] + param_loops : c[0x0][0x1a0] + param_dimB : c[0x0][0x1a4] + param_dimC : c[0x0][0x1a8] + param_dimH : c[0x0][0x1ac] + param_unrolling : c[0x0][0x1b0] + param_numBlks : c[0x0][0x1b4] + param_numAblks : c[0x0][0x1b8] + + + + + 32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>, offsetB + 80-81 : baseB<0-1> + + 0-31 : czero<00-31> + + 3, 2,11,10 : cx<0-3>y0 + 7, 6,15,14 : cx<0-3>y1 + 1, 0, 9, 8 : cx<0-3>y2 + 5, 4,13,12 : cx<0-3>y3 + 19,18,27,26 : cx<0-3>y4 + 23,22,31,30 : cx<0-3>y5 + 17,16,25,24 : cx<0-3>y6 + 21,20,29,28 : cx<0-3>y7 + + 32-43 : j0Ay<0-7>, j0Bx<0-3> + 44-55 : j1Ay<0-7>, j1Bx<0-3> + 56-67 : j2Ay<0-7>, j2Bx<0-3> + 68-79 : j3Ay<0-7>, j3Bx<0-3> + + 80-83 : loadB<0-3> + 84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3> + + 100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1> + + 110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb + 121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, time_step + + 32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1> + 40-47 : c<0-3>, d3, d2, d1, d0 + 48-63 : H00y<0-1>, H04y<0-1>, H08y<0-1>, H12y<0-1>, 
h0, h1, h2, h3, baseC<0-1>, baseH<0-1> + 64-68 : blkId, nextBlk, lockAddr<0-1>, lockVal + 69-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags, xcutoff, offsetC, offsetH, numBlk, predSave, ldh1, ldh4, ldh60 + + + +--:-:1:-:1 S2R tid, SR_TID.X; +--:-:2:-:1 S2R blkA, SR_CTAID.Y; +--:-:3:-:1 S2R blkB, SR_CTAID.Z; + +--:-:-:-:1 MOV time_step, RZ; +--:-:-:-:1 MOV flags, param_flags; + +RNN_LOOP: + + +--:-:-:-:1 MOV k, param_k; +--:-:-:-:1 MOV lda, param_lda8; +--:-:-:-:1 MOV ldb, param_ldb8; +--:-:-:-:1 SHR.U32 lda, lda, 5; +--:-:-:-:1 SHR.U32 ldb, ldb, 5; +--:-:-:-:1 MOV ldaz, param_ldaz; +--:-:-:-:1 MOV ldbz, param_ldbz; +--:-:-:-:1 SHL lda16, lda, 6; +--:-:-:-:1 SHL ldb16, ldb, 6; +--:-:-:-:1 SHL lda4, lda, 2; + +--:-:-:-:1 STS.128 [addr_zero], RZ; + + return join '', map sprintf("--:-:-:-:1 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; + + +--:-:-:-:6 LOP.AND.NZ P0, RZ, flags, 4; +--:-:-:-:6 @P0 IADD offsetB, -time_step, param_unrolling; +--:-:-:-:6 @P0 IADD offsetB, offsetB, -1; +--:-:-:-:6 @!P0 MOV offsetB, time_step; + +// baseB = param_B + dimB * time_step +--:-:-:-:1 XMAD offsetB, offsetB, param_dimB, RZ; +--:-:-:-:1 LEA baseB0.CC, offsetB, param_B[0], 2; +--:-:-:-:1 LEA.HI.X baseB1, offsetB, param_B[1], RZ, 2; + +// tidAX = (tid & 31) << 2 +// tidAY = (tid >> 5) +01:-:-:-:1 LOP.AND tidAX, tid, 31; +--:-:-:-:1 SHL tidAX, tidAX, 2; +--:-:-:-:1 SHR.U32 tidAY, tid, 5; + +// tidBX = (tid & 7) << 2 +// tidBY = (tid >> 3) +01:-:-:-:1 LOP.AND tidBX, tid, 7; +--:-:-:-:1 SHL tidBX, tidBX, 2; +--:-:-:-:1 SHR.U32 tidBY, tid, 3; + +// trackA += (blkA*128 + tidAX + lda*tidAY) * 4 +02:-:-:-:1 ISCADD txa, blkA, tidAX, 7; +--:-:-:-:1 XMAD.LO2 ta0, lda, tidAY, txa; +08:-:-:-:1 XMAD.LO2 ta0, ldaz, RZ, ta0; +--:-:-:-:1 IADD ta1, ta0, lda4; +--:-:-:-:1 IADD ta2, ta1, lda4; +--:-:-:-:1 IADD ta3, ta2, lda4; + +--:-:-:-:1 LEA track0A0.CC, ta0, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track0A1, ta0, param_A[1], RZ, 
2; +--:-:-:-:1 LEA track1A0.CC, ta1, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track1A1, ta1, param_A[1], RZ, 2; +--:-:-:-:1 LEA track2A0.CC, ta2, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track2A1, ta2, param_A[1], RZ, 2; +--:-:-:-:1 LEA track3A0.CC, ta3, param_A[0], 2; +--:-:-:-:1 LEA.HI.X track3A1, ta3, param_A[1], RZ, 2; + +// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4 +04:-:-:-:1 ISCADD txb, blkB, tidBX, 5; +--:-:-:-:1 XMAD.LO2 tb, ldb, tidBY, txb; +08:-:-:-:1 XMAD.LO2 tb, ldbz, RZ, tb; +--:-:-:-:1 LEA trackB0.CC, tb, baseB0, 2; +--:-:-:-:1 LEA.HI.X trackB1, tb, baseB1, RZ, 2; + +// writeAs = (tidAY*128 + tidAX) * 4 +--:-:-:-:1 ISCADD writeAs, tidAY, tidAX, 7; +--:-:-:-:1 ISCADD writeAs, writeAs, 4x, 2; + +// writeBs = (tidBY*32 + tidBX) * 4 +--:-:-:-:1 ISCADD writeBs, tidBY, tidBX, 5; +--:-:-:-:1 ISCADD writeBs, writeBs, 4x, 2; + +// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4; +--:-:-:-:1 LOP.AND tid1, tid, 1; +--:-:-:-:1 LOP.AND readAs, tid, 0x70; +--:-:-:-:1 SHR.U32 readAs, readAs, 3; +--:-:-:-:1 LOP.OR readAs, readAs, tid1; +--:-:-:-:1 SHL readAs, readAs, 4; +// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>; +--:-:-:-:1 BFE.U32 readBs, tid, 0x301; // 3 bits at position 1 +--:-:-:-:1 ISCADD readBs, readBs, 4x, 4; + +--:-:-:-:1 MOV32I swapBuf, -4x; + + +REMAINDER: + + + +--:-:-:-:1 IADD tidAY1, tidAY, 4; +--:-:-:-:1 IADD tidAY2, tidAY, 8; +--:-:-:-:1 IADD tidAY3, tidAY, 12; + + + our $vec; + return $vec ? 
q{ +--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + +--:-:-:-:1 ISETP.LT.AND P0, PT, tidAY, k, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, tidAY1, k, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, tidAY2, k, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, tidAY3, k, P5; +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, P6; + + +--:-:1:-:1 @P0 LDG.E.128 load0A, [track0A]; +--:-:2:-:1 @P1 LDG.E.128 load1A, [track1A]; +--:-:3:-:1 @P2 LDG.E.128 load2A, [track2A]; +--:-:4:-:1 @P3 LDG.E.128 load3A, [track3A]; +--:-:5:-:1 @P4 LDG.E.128 loadB, [trackB]; + + + +--:-:6:-:1 @!P0 LDS.U.128 load0A, [addr_zero]; +--:-:6:-:1 @!P1 LDS.U.128 load1A, [addr_zero]; +--:-:6:-:1 @!P2 LDS.U.128 load2A, [addr_zero]; +--:-:6:-:1 @!P3 LDS.U.128 load3A, [addr_zero]; +--:-:6:-:2 @!P4 LDS.U.128 loadB, [addr_zero]; + + + } : q{ + +--:-:-:-:1 IADD txa1, txa, 1; +--:-:-:-:1 IADD txa2, txa, 2; +--:-:-:-:1 IADD txa3, txa, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidAY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P4; + +--:-:1:-:1 @P0 LDG.E load0A0, [track0A + 4x<0>]; +--:-:1:-:1 @P1 LDG.E load0A1, [track0A + 4x<1>]; +--:-:1:-:1 @P2 LDG.E load0A2, [track0A + 4x<2>]; +--:-:1:-:1 @P3 LDG.E load0A3, [track0A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load0A0, RZ; +--:-:-:-:1 @!P1 MOV load0A1, RZ; +--:-:-:-:1 @!P2 MOV load0A2, RZ; +--:-:-:-:1 @!P3 MOV load0A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY1, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:2:-:1 @P0 LDG.E load1A0, [track1A + 4x<0>]; +--:-:2:-:1 @P1 LDG.E load1A1, [track1A + 4x<1>]; +--:-:2:-:1 @P2 LDG.E load1A2, [track1A + 4x<2>]; +--:-:2:-:1 @P3 LDG.E load1A3, [track1A + 4x<3>]; + +--:-:-:-:1 @!P0 
MOV load1A0, RZ; +--:-:-:-:1 @!P1 MOV load1A1, RZ; +--:-:-:-:1 @!P2 MOV load1A2, RZ; +--:-:-:-:1 @!P3 MOV load1A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P6, PT, tidAY2, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P6; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P6; + +--:-:3:-:1 @P0 LDG.E load2A0, [track2A + 4x<0>]; +--:-:3:-:1 @P1 LDG.E load2A1, [track2A + 4x<1>]; +--:-:3:-:1 @P2 LDG.E load2A2, [track2A + 4x<2>]; +--:-:3:-:1 @P3 LDG.E load2A3, [track2A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load2A0, RZ; +--:-:-:-:1 @!P1 MOV load2A1, RZ; +--:-:-:-:1 @!P2 MOV load2A2, RZ; +--:-:-:-:1 @!P3 MOV load2A3, RZ; + +--:-:-:-:1 ISETP.LT.AND P5, PT, tidAY3, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txa, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P1, PT, txa1, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P2, PT, txa2, param_m, P5; +--:-:-:-:1 ISETP.LT.AND P3, PT, txa3, param_m, P5; + +--:-:4:-:1 @P0 LDG.E load3A0, [track3A + 4x<0>]; +--:-:4:-:1 @P1 LDG.E load3A1, [track3A + 4x<1>]; +--:-:4:-:1 @P2 LDG.E load3A2, [track3A + 4x<2>]; +--:-:4:-:1 @P3 LDG.E load3A3, [track3A + 4x<3>]; + +--:-:-:-:1 @!P0 MOV load3A0, RZ; +--:-:-:-:1 @!P1 MOV load3A1, RZ; +--:-:-:-:1 @!P2 MOV load3A2, RZ; +--:-:-:-:1 @!P3 MOV load3A3, RZ; + +--:-:-:-:1 IADD txb1, txb, 1; +--:-:-:-:1 IADD txb2, txb, 2; +--:-:-:-:1 IADD txb3, txb, 3; + +--:-:-:-:1 ISETP.LT.AND P4, PT, tidBY, k, PT; +--:-:-:-:1 ISETP.LT.AND P0, PT, txb, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P1, PT, txb1, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P2, PT, txb2, param_n, P4; +--:-:-:-:1 ISETP.LT.AND P3, PT, txb3, param_n, P4; + +--:-:5:-:1 @P0 LDG.E loadB0, [trackB + 4x<0>]; +--:-:5:-:1 @P1 LDG.E loadB1, [trackB + 4x<1>]; +--:-:5:-:1 @P2 LDG.E loadB2, [trackB + 4x<2>]; +--:-:5:-:1 @P3 LDG.E loadB3, [trackB + 4x<3>]; + +--:-:-:-:1 @!P0 MOV loadB0, RZ; +--:-:-:-:1 @!P1 MOV loadB1, RZ; +--:-:-:-:1 @!P2 MOV loadB2, RZ; +--:-:-:-:1 @!P3 MOV loadB3, RZ; + 
+--:-:-:-:1 ISETP.LT.AND P5, PT, txa, param_m, PT; +--:-:-:-:1 ISETP.LT.AND P6, PT, txb, param_n, PT; + }; + + +--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5; +--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6; + +// bDoRemainder = k & 15 && k > 16 +--:-:-:-:0 LOP.AND.NZ P1, RZ, k, 15; + + + +21:-:-:-:1 STS.128 [writeAs + 4x<0*128>], load0A; +--:-:-:-:6 IADD track0A0.CC, track0A0, lda16; +--:-:-:-:0 IADD.X track0A1, track0A1, RZ; + +02:-:-:-:1 STS.128 [writeAs + 4x<4*128>], load1A; +--:-:-:-:6 IADD track1A0.CC, track1A0, lda16; +--:-:-:-:0 IADD.X track1A1, track1A1, RZ; + +04:-:-:-:1 STS.128 [writeAs + 4x<8*128>], load2A; +--:-:-:-:6 IADD track2A0.CC, track2A0, lda16; +--:-:-:-:0 IADD.X track2A1, track2A1, RZ; + +08:-:-:-:1 STS.128 [writeAs + 4x<12*128>], load3A; +--:-:-:-:6 IADD track3A0.CC, track3A0, lda16; +--:-:-:-:0 IADD.X track3A1, track3A1, RZ; + +10:-:-:-:1 STS.128 [writeBs], loadB; +--:-:-:-:1 IADD trackB0.CC, trackB0, ldb16; + +--:-:-:-:1 ISETP.GT.AND P1, PT, k, 16, P1; + +--:-:-:-:1 IADD readBs, readBs, -swapBuf; +--:-:-:-:0 IADD readAs, readAs, -swapBuf; +--:-:-:-:5 BAR.SYNC 0; +--:-:-:-:1 IADD writeBs, writeBs, swapBuf; +--:-:-:-:1 IADD writeAs, writeAs, swapBuf; +--:-:-:-:1 IADD swapBuf, RZ, -swapBuf; + +--:-:-:-:0 IADD.X trackB1, trackB1, RZ; + + + our $vec; + return $vec ? 
q{ +--:-:3:-:1 @P5 LDG.E.128 load0A, [track0A]; +--:-:4:-:1 @P5 LDG.E.128 load1A, [track1A]; +--:-:5:-:1 @P5 LDG.E.128 load2A, [track2A]; +--:-:5:-:1 @P5 LDG.E.128 load3A, [track3A]; +--:-:6:-:1 @P6 LDG.E.128 loadB, [trackB]; + } : q{ +--:-:3:-:1 @P5 LDG.E load0A0, [track0A + 4x<0>]; +--:-:3:-:1 @P5 LDG.E load0A1, [track0A + 4x<1>]; +--:-:3:-:1 @P5 LDG.E load0A2, [track0A + 4x<2>]; +--:-:3:-:1 @P5 LDG.E load0A3, [track0A + 4x<3>]; + +--:-:4:-:1 @P5 LDG.E load1A0, [track1A + 4x<0>]; +--:-:4:-:1 @P5 LDG.E load1A1, [track1A + 4x<1>]; +--:-:4:-:1 @P5 LDG.E load1A2, [track1A + 4x<2>]; +--:-:4:-:1 @P5 LDG.E load1A3, [track1A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E load2A0, [track2A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E load2A1, [track2A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E load2A2, [track2A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E load2A3, [track2A + 4x<3>]; + +--:-:5:-:1 @P5 LDG.E load3A0, [track3A + 4x<0>]; +--:-:5:-:1 @P5 LDG.E load3A1, [track3A + 4x<1>]; +--:-:5:-:1 @P5 LDG.E load3A2, [track3A + 4x<2>]; +--:-:5:-:1 @P5 LDG.E load3A3, [track3A + 4x<3>]; + +--:-:6:-:1 @P6 LDG.E loadB0, [trackB + 4x<0>]; +--:-:6:-:1 @P6 LDG.E loadB1, [trackB + 4x<1>]; +--:-:6:-:1 @P6 LDG.E loadB2, [trackB + 4x<2>]; +--:-:6:-:1 @P6 LDG.E loadB3, [trackB + 4x<3>]; + }; + + + + our $vec; + our $shiftAX = 0; + our $shiftBX = 0; + our %insert = + ( + j0c6 => "--:-:-:-:1 IADD k, k, -16;\n", + j0c14 => "--:-:-:-:1 ISETP.GE.AND P0, PT, k, 16, PT;\n", + + j3c6 => "04:3:-:-:1 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n", + j5c6 => "08:4:-:-:1 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n", + j7c6 => "10:-:-:-:1 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n", + j9c6 => "--:5:-:-:1 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n", + j11c6 => "20:6:-:-:1 \@P0 STS.128 [writeBs], loadB;\n", + + j3c7 => "--:-:-:-:1 \@P2 IADD track0A0.CC, track0A0, lda16;\n", + j3c13 => "--:-:-:-:1 \@P2 IADD.X track0A1, track0A1, RZ;\n", + j5c7 => "--:-:-:-:1 \@P3 IADD track1A0.CC, track1A0, lda16;\n", + j5c13 => "--:-:-:-:1 \@P3 IADD.X 
track1A1, track1A1, RZ;\n", + j7c7 => "--:-:-:-:1 \@P5 IADD track2A0.CC, track2A0, lda16;\n", + j7c13 => "--:-:-:-:1 \@P5 IADD.X track2A1, track2A1, RZ;\n", + j9c7 => "--:-:-:-:1 \@P5 IADD track3A0.CC, track3A0, lda16;\n", + j9c13 => "--:-:-:-:1 \@P5 IADD.X track3A1, track3A1, RZ;\n", + j11c7 => "--:-:-:-:1 \@P6 IADD trackB0.CC, trackB0, ldb16;\n", + j11c13 => "--:-:-:-:1 \@P6 IADD.X trackB1, trackB1, RZ;\n", + + j3c14 => "--:-:-:-:1 ISETP.GE.AND P2, PT, k, 32, P2;\n", + j5c14 => "--:-:-:-:1 ISETP.GE.AND P3, PT, k, 32, P3;\n", + j9c14 => "--:-:-:-:1 ISETP.GE.AND P5, PT, k, 32, P5;\n", + j11c14 => "--:-:-:-:1 ISETP.GE.AND P6, PT, k, 32, P6;\n", + + j13c31 => "--:-:-:-:5 \@P0 BAR.SYNC 0;\n" . + "--:-:-:-:1 \@P0 IADD readAs, readAs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD readBs, readBs, -swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeAs, writeAs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD writeBs, writeBs, swapBuf;\n" . + "--:-:-:-:1 \@P0 IADD swapBuf, RZ, -swapBuf;\n", + + ($vec ? + ( + j3c29 => "04:-:3:-:1 \@P2 LDG.E.128 load0A, [track0A];\n", + j5c29 => "08:-:4:-:1 \@P3 LDG.E.128 load1A, [track1A];\n", + j9c29 => "10:-:-:-:1 \@P5 LDG.E.128 load2A, [track2A];\n", + j9c31 => "--:-:5:-:1 \@P5 LDG.E.128 load3A, [track3A];\n", + j11c29 => "20:-:6:-:1 \@P6 LDG.E.128 loadB, [trackB];\n", + ) : + ( + j3c29 => "04:-:-:-:1 \@P2 LDG.E load0A0, [track0A + 4x<0>];\n", + j3c31 => "--:-:-:-:1 \@P2 LDG.E load0A1, [track0A + 4x<1>];\n", + j4c1 => "--:-:-:-:1 \@P2 LDG.E load0A2, [track0A + 4x<2>];\n", + j4c3 => "--:-:3:-:1 \@P2 LDG.E load0A3, [track0A + 4x<3>];\n", + + j5c29 => "08:-:-:-:1 \@P3 LDG.E load1A0, [track1A + 4x<0>];\n", + j5c31 => "--:-:-:-:1 \@P3 LDG.E load1A1, [track1A + 4x<1>];\n", + j6c1 => "--:-:-:-:1 \@P3 LDG.E load1A2, [track1A + 4x<2>];\n", + j6c3 => "--:-:4:-:1 \@P3 LDG.E load1A3, [track1A + 4x<3>];\n", + + j9c29 => "10:-:-:-:1 \@P5 LDG.E load2A0, [track2A + 4x<0>];\n", + j9c31 => "--:-:-:-:1 \@P5 LDG.E load2A1, [track2A + 4x<1>];\n", + j10c1 => "--:-:-:-:1 \@P5 LDG.E 
load2A2, [track2A + 4x<2>];\n", + j10c3 => "--:-:-:-:1 \@P5 LDG.E load2A3, [track2A + 4x<3>];\n", + + j10c8 => "--:-:-:-:1 \@P5 LDG.E load3A0, [track3A + 4x<0>];\n", + j10c10 => "--:-:-:-:1 \@P5 LDG.E load3A1, [track3A + 4x<1>];\n", + j10c12 => "--:-:-:-:1 \@P5 LDG.E load3A2, [track3A + 4x<2>];\n", + j10c14 => "--:-:5:-:1 \@P5 LDG.E load3A3, [track3A + 4x<3>];\n", + + j11c29 => "20:-:-:-:1 \@P6 LDG.E loadB0, [trackB + 4x<0>];\n", + j11c31 => "--:-:-:-:1 \@P6 LDG.E loadB1, [trackB + 4x<1>];\n", + j12c1 => "--:-:-:-:1 \@P6 LDG.E loadB2, [trackB + 4x<2>];\n", + j12c3 => "--:-:6:-:1 \@P6 LDG.E loadB3, [trackB + 4x<3>];\n", + ) + ), + + j15c31 => "--:-:-:Y:5 \@P0 BRA.U LOOP;\n" . + "--:-:-:Y:5 \@P1 BRA.U REMAINDER;\n" + ); + return ''; + + + diff --git a/README.md b/README.md new file mode 100644 index 0000000..34711e0 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +# DeepPerf + +DeepPerf is developed to understand GPU microarchitectural features and improve performance for compute-intensive kernels. The methodology relies on a reverse engineering approach to crack the GPU ISA encodings in order to build a GPU assembler. An assembly microbenchmark suite correlates microarchitectural features with their performance factors to uncover instruction-level and memory hierarchy preferences. +We use SGEMM and Convolution as examples to show how to achieve bare-metal performance tuning. In your deep learning framework, you can use this SASS code directly to speed up performance. + +The toolchain is an attempt to automatically crack different GPU ISA encodings and build an assembler adaptively for the purpose of performance enhancements to applications on GPUs. +There are three directories in this folder, which correspond to the three major steps for optimizing CUDA code at the assembly level. All the tools cover three recent NVIDIA GPU architectures: Kepler, Maxwell and Pascal. 
+ + diff --git a/Solver/.gitignore b/Solver/.gitignore new file mode 100644 index 0000000..1a8323b --- /dev/null +++ b/Solver/.gitignore @@ -0,0 +1,2 @@ +data/* +output/* diff --git a/Solver/README.md b/Solver/README.md new file mode 100644 index 0000000..53a0f21 --- /dev/null +++ b/Solver/README.md @@ -0,0 +1,32 @@ + +# Cracking GPU ISA Encodings + +## Output + +* Bit positions of opcodes +* Bit positions of operands for different operand types +* Bit positions of modifiers for each instruction + +## How to run the workflow? + +The workflow is composed of four stages: + +1. Generate PTX code->`./bin/generate_disassemble [arch]` + * Generate PTX code (.ptx) in ptxgen directory and compile PTX to cubin; + * Disassemble cubins to sass files, which feed into the following three solvers; + * Each line of sass files looks like this: + + `/*0048*/ IADD R0, R2, R0; /*0x4800000000201c03*/` + +2. Opcode solver->`./bin/opcode [arch]` + * Probe 64-bit binary code of sass files by flipping each bit and observing whether opcodes change; + +3. Modifier solver->`./bin/modifier [arch]` + * Probe 64-bit binary code of sass files by flipping each bit and observing whether modifiers change; + * Enumerate bits on all modifier positions to generate all the modifiers; + +4. Operand solver->`./bin/operand [arch]` + * Probe 64-bit binary code of sass files by flipping each bit and observing whether operands change; + * Operand type: R: Register, S: Special Register, I: Immediate, C: constant[][], M: Memory, P: Predicate; + +5. Allowed values for `[arch]` options: 'sm_30','sm_32','sm_35','sm_37','sm_50','sm_52','sm_53','sm_60','sm_61','sm_62'. 
diff --git a/Solver/bin/generate_disassemble b/Solver/bin/generate_disassemble new file mode 100755 index 0000000..e37b189 --- /dev/null +++ b/Solver/bin/generate_disassemble @@ -0,0 +1,82 @@ +#!/bin/bash + +# Please input architecture parameter argv[1] +if [ "$#" -lt 1 ] +then + echo "Please input architecture parameter argv[1]" + exit -1 +fi + +arch=$1 +prefix="data/"$arch"/" +src_directory="src/" + +echo "Arch: "$arch +echo "Data directory: "$prefix + +# 1. Generate ptx +ptx_directory=$prefix"ptx/" +mkdir -p $ptx_directory +echo ".................................................................." +echo "1. Generate .ptx files to "$ptx_directory" directory" +echo "It may take serveral miniutes" +echo ".................................................................." +perl $src_directory"ptxgen.pl" $arch $ptx_directory + +# 2. Compile to cubins +cubin_directory=$prefix"cubin/" +mkdir -p $cubin_directory +echo ".................................................................." +echo "2. Compile .ptx file to cubin files in "$cubin_directory" directory" +echo "It may take serveral miniutes" +echo ".................................................................." +ptx=$ptx_directory"*.ptx" +for p in $ptx +do + f=`echo $p | cut -d / -f 4 |cut -d . -f 1` + fout=$cubin_directory""$f".cubin" + echo $fout + ptxas -arch $arch -m 64 $p -o $fout > /dev/null 2>&1 +done + +# 3. Disassembly to sass +asm_directory=$prefix"asm/" +mkdir -p $asm_directory +echo ".................................................................." +echo "3. Disassemble .cubin file to sass files in "$asm_directory" directory" +echo "It may take serveral miniutes" +echo ".................................................................." +cubin=$cubin_directory"*.cubin" +for p in $cubin +do + f=`echo $p | cut -d / -f 4 | cut -d . 
-f 1` + fout=$asm_directory""$f".sass" + echo $fout + cuobjdump --gpu-architecture $arch --dump-sass $p > $fout +done + +# 4.Put all sass results in one file +echo ".................................................................." +echo "4. Gathering results from ptxgen" +echo ".................................................................." +asm=$asm_directory"*.sass" +if [ -f /tmp/all.sass ] +then + rm /tmp/all.sass +else + touch /tmp/all.sass +fi + +for f in $asm +do + cat $f >> /tmp/all.sass +done + +# Ignore non-instruction lines +awk '{if (NF >= 5) {$1 = ""; print $0} }' /tmp/all.sass > /tmp/all_inst.sass +# Make instruction uniq +python $src_directory"unique.py" /tmp/all_inst.sass > $prefix""$arch".sass" +# Generate test cubin +nvcc -cubin -arch $arch $src_directory"test.cu" -o $prefix""$arch".cubin" + +rm /tmp/all.sass /tmp/all_inst.sass diff --git a/Solver/bin/modifier b/Solver/bin/modifier new file mode 100755 index 0000000..68bdea3 --- /dev/null +++ b/Solver/bin/modifier @@ -0,0 +1,19 @@ +#!/bin/bash + +# Please input architecture parameter argv[1] +if [ "$#" -lt 1 ] +then + echo "Please input architecture parameter argv[1]" + exit -1 +fi + +arch=$1 +prefix="data/"$arch"/" +src_directory="src/" +asm_directory=$prefix +output_directory="output/"$arch"/" +output_file=$output_directory""$arch".modifier" +mkdir -p $output_directory +rm -rf $output_file || true +echo "Output file: "$output_file +python $src_directory"modifier.py" $asm_directory""$arch".sass" $arch $output_file diff --git a/Solver/bin/opcode b/Solver/bin/opcode new file mode 100755 index 0000000..7f25fb1 --- /dev/null +++ b/Solver/bin/opcode @@ -0,0 +1,19 @@ +#!/bin/bash + +# Please input architecture parameter argv[1] +if [ "$#" -lt 1 ] +then + echo "Please input architecture parameter argv[1]" + exit -1 +fi + +arch=$1 +prefix="data/"$arch"/" +src_directory="src/" +asm_directory=$prefix +output_directory="output/"$arch"/" +output_file=$output_directory""$arch".opcode" +mkdir -p 
$output_directory +rm -rf $output_file || true +echo "Output file: "$output_file +python $src_directory"opcode.py" $asm_directory""$arch".sass" $arch $output_file diff --git a/Solver/bin/operand b/Solver/bin/operand new file mode 100755 index 0000000..5c6d9e4 --- /dev/null +++ b/Solver/bin/operand @@ -0,0 +1,19 @@ +#!/bin/bash + +# Please input architecture parameter argv[1] +if [ "$#" -lt 1 ] +then + echo "Please input architecture parameter argv[1]" + exit -1 +fi + +arch=$1 +prefix="data/"$arch"/" +src_directory="src/" +asm_directory=$prefix +output_directory="output/"$arch"/" +output_file=$output_directory""$arch".operand" +mkdir -p $output_directory +rm -rf $output_file || true +echo "Output file: "$output_file +python $src_directory"operand.py" $asm_directory""$arch".sass" $arch $output_file diff --git a/Solver/src/__init__.py b/Solver/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Solver/src/dumper.py b/Solver/src/dumper.py new file mode 100644 index 0000000..934c012 --- /dev/null +++ b/Solver/src/dumper.py @@ -0,0 +1,27 @@ +import os +import struct + +def arch2mode(arch): + return arch.replace("_", "").upper() + +def dump(newcode, arch): + version = arch.split("_")[1] + if version < 40: + tmp_bin = "/tmp/tmp_dumper.bin" + fout = open(tmp_bin, "wb") + fout.write(struct.pack("> j) & 0x1) << pos[j]) | bits + enc = enc & (~(1 << pos[j])) + dump_file = dump("0x{:016x}".format(enc | bits), arch) + if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1 and dump_file.find("INVALID") == -1: + line = dump_file.split("\n") + if version < 40: + line_inst = line[1].split(); + else: + line_inst = line[5].split(); + line_inst.pop(0) + logging.info("0b{:064b}".format(bits) + ": " + " ".join(line_inst)) diff --git a/Solver/src/inst.py b/Solver/src/inst.py new file mode 100644 index 0000000..3ab66c6 --- /dev/null +++ b/Solver/src/inst.py @@ -0,0 +1,42 @@ +from sets import Set + +class Inst: + def __init__(self, inst, raw = 
True): + # Fetech binary encoding + if raw == True: # From cuobjdump + self.__enc = inst[-2] + inst.pop(-1) + inst.pop(-1) + inst.pop(-1) + else: # From nvdisasm + self.__enc = "" + + if inst[0] == '{': # Check dual issue + inst.pop(0) + self.__pred = "" + if inst[0].find('@') != -1: # Check predicate, such as @P0 + self.__pred = inst.pop(0) + + # Remove semicolon of zero operand field instruction such as "RRO;" + ops = inst.pop(0).replace(";", "") + # Fetech opcode + self.__op = ops.split(".")[0] + # Split opcode + self.__modifier = ops.split(".")[1:] + # Fetech operands and remove ; and , + self.__operands = ' '.join(inst).replace(";", "").replace(",", "").replace("-","").replace("|","") + + def op(self): + return str(self.__op) + + def modifier(self): + return str(self.__modifier) + + def enc(self): + return str(self.__enc) + + def operands(self): + return str(self.__operands) + + def pred(self): + return str(self.__pred) diff --git a/Solver/src/modifier.py b/Solver/src/modifier.py new file mode 100644 index 0000000..50bfbb7 --- /dev/null +++ b/Solver/src/modifier.py @@ -0,0 +1,55 @@ +from inst import Inst +from dumper import dump +import enumerator +import sys +import logging + +if __name__ == "__main__": + logging.basicConfig(filename = sys.argv[3], level = logging.INFO) + logging.debug("argv[1]: Disassemble file") + logging.debug("argv[2]: Arch") + logging.debug("argv[3]: Output file") + logging.debug("argv[4]: Instruction limit (default 100)") + sass = sys.argv[1] + arch = sys.argv[2] + if len(sys.argv) >= 5: + limit = sys.argv[4] + else: + limit = 100 + count = 0 + version = int(arch.split("_")[1]) + with open(sass) as f: + for line in f: + pos = [] + count += 1 + if count == limit: + break + line_split = line.split() + # Construct instruction structure + origin = Inst(line_split) + # Find the 64-bit encodings + base = int(origin.enc(), 16) + # Bit by bit xor, observe whether opcode changes and guess what this bit represent + for i in range(0, 64): + mask = 
2**i + newcode = base ^ mask + # Disassemble the new code + dump_file = dump("0x{:016x}".format(newcode), arch) + # Compare the disassemble to check which field changes: opcode, operand or modifer + if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1: + line = dump_file.split("\n") + if version < 40: + line_inst = line[1].split(); + else: + line_inst = line[5].split(); + # [0]: header info, [1] instruction part + line_inst.pop(0) + # Parse the new generated disassembly + inst = Inst(line_inst, raw = version > 40) + if inst.modifier() != origin.modifier() and inst.op() == origin.op(): + if i not in pos: + pos.append(i) + # Enumerate all modifiers + if len(pos) > 0: + logging.info("%s modifier bits %s: ", origin.op(), pos); + enumerator.enumerate(base, pos, arch) diff --git a/Solver/src/opcode.py b/Solver/src/opcode.py new file mode 100644 index 0000000..c7113df --- /dev/null +++ b/Solver/src/opcode.py @@ -0,0 +1,62 @@ +from inst import Inst +from dumper import dump +import sys +import logging + +if __name__ == "__main__": + logging.basicConfig(filename = sys.argv[3], level = logging.INFO) + logging.debug("argv[1]: Disassemble file") + logging.debug("argv[2]: Arch") + logging.debug("argv[3]: Output file") + logging.debug("argv[4]: Instruction limit (default 100)") + sass = sys.argv[1] + arch = sys.argv[2] + if len(sys.argv) >= 5: + limit = sys.argv[4] + else: + limit = 100 + count = 0; + version = int(arch.split("_")[1]) + with open(sass) as f: + for line in f: + pos = [] + bits = 0x0 + count += 1 + if count == limit: + break + line_split = line.split() + # Construct instruction structure + origin = Inst(line_split) + # Find the 64-bit encodings + base = int(origin.enc(), 16) + # Bit by bit xor, observe whether opcode changes and guess what this bit represent + for i in range(0, 64): + mask = 2**i + newcode = base ^ mask + # Disassemble the new code + dump_file = dump("0x{:016x}".format(newcode), arch) + # Compare the disassemble to check 
which field changes: opcode, operand or modifer + if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1: + line = dump_file.split("\n") + if version < 40: + line_inst = line[1].split(); + else: + line_inst = line[5].split(); + # [0]: header info, [1] instruction part + line_inst.pop(0) + # Parse the new generated disassembly + inst = Inst(line_inst, raw = version > 40) + # If opcode is changed, then this bit represent opcode, we find it! + # LDG and TEX are the same instructions in fact + # RED and ATOM are the same instruction + if inst.op() != origin.op() and not i in pos and not \ + (inst.op() == "LDG" and origin.op() == "TEX") and not \ + (inst.op() == "TEX" and origin.op() =="LDG") and not \ + (inst.op() == "RED" and origin.op() == "ATOM") and not \ + (inst.op() == "ATOM" and origin.op() == "RED"): + logging.info("Opcode changes: %s => %s when bit [%d] is flipped from [%d]", \ + origin.op(), inst.op(), i, (base >> i) & 0x1) + bits = bits | (((base >> i) & 0x1) << i) + pos.append(i) + if len(pos) > 0: + logging.info("0b{:064b}".format(bits) + ": %s opcode bits %s: ", origin.op(), pos); diff --git a/Solver/src/operand.py b/Solver/src/operand.py new file mode 100644 index 0000000..6b7d12a --- /dev/null +++ b/Solver/src/operand.py @@ -0,0 +1,114 @@ +from sets import Set +from inst import Inst +from dumper import dump +import sys +import logging + +ops = dict() + +def check_operand_types(inst): + operand_types = "" + operands = inst.operands().split(); + for operand in operands: + key = operand[0] + if key == 'R': # Register + value = operand[1:] + if value == 'Z' or value == 'N' or value == 'M' or \ + value == 'P' or float(value).is_integer(): + operand_types += 'R' + else: + return 'X' + elif key == 'P': # Predicate + value = operand[1:] + if float(value).is_integer(): + operand_types += 'P' + else: + return 'X' + elif key == 'c': # Constant memory + operand_types += 'C' + elif key == '[': # Memory + operand_types += 'M' + elif key == 'S': 
# Special register + operand_types += 'S' + else: + if len(operand) >= 2 and (operand[0:2] == "0x" or operand[0:3] == "-0x"): # Hex immediate + operand_types += 'I' + elif float(operand).is_integer(): # Immediate value + operand_types += 'I' + else: + return 'X' + if inst.op() not in ops: + ops[inst.op()] = set() + ops[inst.op()].add(operand_types) + return operand_types + elif inst.op() in ops and operand_types not in ops[inst.op()]: + ops[inst.op()].add(operand_types) + return operand_types + else: + return 'X' + +def change(inst, origin): + if inst.op() != origin.op(): + return -1 + elif inst.modifier() != origin.modifier(): + return -2 + else: + inst_operands = inst.operands().split() + origin_operands = origin.operands().split() + for i in range(len(origin_operands)): + if (inst_operands[i] != origin_operands[i]): + return i + return -3 + +if __name__ == "__main__": + logging.basicConfig(filename = sys.argv[3], level = logging.INFO) + logging.debug("argv[1]: Disassemble file") + logging.debug("argv[2]: Arch") + logging.debug("argv[3]: Output file") + logging.debug("argv[4]: Instruction limit (default 100)") + sass = sys.argv[1] + arch = sys.argv[2] + if len(sys.argv) >= 5: + limit = sys.argv[4] + else: + limit = 100 + count = 0; + version = int(arch.split("_")[1]) + with open(sys.argv[1]) as f: + for line in f: + pos = [] + count += 1 + if count == limit: + break + line_split = line.split() + # Construct instruction structure + origin = Inst(line_split) + # Find the 64-bit encodings + base = int(origin.enc(), 16) + origin_operand_types = check_operand_types(origin) + if len(origin.operands()) and origin_operand_types.find('X') == -1: + pp = [[] for i in range(len(origin_operand_types))] + logging.info(origin.op() + " " + origin.modifier()) + logging.info("0b{:064b}".format(base) + ": " + origin.operands()) + for i in range(0, 64): + mask = 2**i + newcode = base ^ mask + # Disassemble the new code + dump_file = dump("0x{:016x}".format(newcode), arch) + if 
dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1: + line = dump_file.split("\n") + if version < 40: + line_inst = line[1].split(); + else: + line_inst = line[5].split(); + # [0]: header info, [1] instruction part + line_inst.pop(0) + inst = Inst(line_inst, raw = version > 40) + pos = change(inst, origin) + if pos >= 0: + pp[pos].append(i) + logging.info("0b{:064b}".format(newcode) + ": " + inst.operands()) + logging.info("Operand combination types: %s", origin_operand_types) + for i in range(0, len(pp)): + logging.info("Operand type: %s", origin_operand_types[i]) + logging.info("Encoding: %s", pp[i]) diff --git a/Solver/src/ptxgen.pl b/Solver/src/ptxgen.pl new file mode 100644 index 0000000..ce985b5 --- /dev/null +++ b/Solver/src/ptxgen.pl @@ -0,0 +1,339 @@ +#!/usr/bin/perl -sw +# Mass RE tool, generates large amounts of .ptx +use Data::Dumper; +use warnings; + +sub cartesian{ + my @C=[]; + foreach(reverse @_){ + #$_ is reference, @$_ is an array + my @A=@$_; + @C=map{my $n=$_; map{[$n,@$_]} @C} @A; + } + return @C; +} + +sub fprint{ + my($filename,$content)=@_; + return if not ($content); + open(FILE, ">".$filename) or die "can't open file: $filename\n"; + print FILE $content; + close FILE; +} + +sub gen_ptx{ + my $desc = shift; + my $code = ""; + $code.=".version ".$$desc{ver}."\n"; + $code.=".target ".$$desc{arch}."\n"; + $code.=".entry bench(.param .u64 I){\n"; + $code.=" .reg .b64 ptr;\n"; + $code.=" .reg .pred pi<".$$desc{pi}.">;\n" if $$desc{pi}; + $code.=" .reg .pred po<".$$desc{po}.">;\n" if $$desc{po}; + foreach my $b (8,16,32,64,128){ + my $key = "r".$b."i"; + $code.=" .reg .b".$b." ".$key."<".$$desc{$key}.">;\n" if $$desc{$key}; + } + foreach my $b (8,16,32,64,128){ + my $key = "r".$b."o"; + $code.=" .reg .b".$b." 
".$key."<".$$desc{$key}.">;\n" if $$desc{$key}; + } + $code.=" ld.param.u64 ptr, [I];\n"; + $code.=" cvta.to.global.u64 ptr, ptr;\n"; + for(my $i=0;$i < ($$desc{pi}||0); $i++){ + $code.=" setp.ne.u64 pi$i, ptr, $i;\n"; + } + foreach my $b (8,16,32,64,128){ + my $key = "r".$b."i"; + for(my $i=0;$i < ($$desc{$key}||0); $i++){ + $code.=" ldu.global.b".$b." ".$key.$i.", [ptr+".($i*$b/8)."];\n"; + } + } + $code.=" ".$$desc{insn}."\n"; + foreach my $b (8,16,32,64,128){ + my $key = "r".$b."o"; + for(my $i=0;$i < ($$desc{$key}||0); $i++){ + $code.=" st.global.b".$b." [ptr+".($i*$b/8)."], ".$key.$i.";\n"; + } + } + for(my $i=0;$i < ($$desc{"po"}||0); $i++){ + $code.=" \@po$i st.global.b8 [ptr+".($i*8)."], ".$i.";\n"; + } + $code.="}\n"; + fprint($$desc{outfile},$code); +} + +my $ver = "5.0"; +my $arch = shift(@ARGV); +my $dir = shift(@ARGV); + +#shorthands +my $us8 = ["u8","s8"]; +my $bus8 = ["b8",@$us8]; +my $us16 = ["u16","s16"]; +my $bus16 = ["b16",@$us16]; +my $us32 = ["u32","s32"]; +my $bus32 = ["b32",@$us32]; +my $fus32 = ["f32",@$us32]; +my $busf32 = ["f32",@$bus32]; +my $us64 = ["u64","s64"]; +my $bus64 = ["b64",@$us64]; +my $busf64 = ["f64",@$bus64]; +my @types = (@$bus8,@$bus16,@$busf32,@$busf64,"b128"); +my $frnd = ["rn","rz","rm","rp"]; +my $irnd = ["rni","rzi","rmi","rpi"]; +my $bcmp = ["eq", "ne"]; +my $scmp = ["eq", "ne", "lt", "le", "gt", "ge"]; +my $ucmp = ["lo", "ls", "hi", "hs"]; +my $fcmp = ["equ", "neu", "ltu", "leu", "gtu", "geu", "num", "nan"]; +my $bool = ["and","or","xor"]; +my $ftz = ["ftz",""]; +my $sat = ["sat",""]; +my $shclamp= ["clamp","wrap"]; +my $lcop = ["ca","cg","cs"]; +my $lcopv = [@$lcop,"lu","cv"]; +my $scop = ["wb","cg","cs","wt"]; + +# TODO: +# try to generate instructions outside PTX ISA (add shl, cmem load-exe) +# conditional +# immediates +# cmem +# lmem +# smem +# ?offsets +my @entries=( + # Integer Arithmetic Instructions + [["mul"],["hi","lo"],$us16,["ARGS"],["r16o"],["r16i"],["r16i"]], + 
[["mul","mul24"],["hi","lo"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"]], + [["mul"],["hi","lo"],$us64,["ARGS"],["r64o"],["r64i"],["r64i"]], + [["mul"],["wide"],$us16,["ARGS"],["r32o"],["r16i"],["r16i"]], + [["mul"],["wide"],$us32,["ARGS"],["r64o"],["r32i"],["r32i"]], + [["mad"],["hi","lo"],$us16,["ARGS"],["r16o"],["r16i"],["r16i"],["r16i"]], + [["mad","mad24"],["hi","lo"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["mad","mad24"],["hi"],["sat"],["s32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["mad"],["hi","lo"],$us64,["ARGS"],["r64o"],["r64i"],["r64i"],["r64i"]], + [["mad"],["wide"],$us16,["ARGS"],["r32o"],["r16i"],["r16i"],["r32i"]], + [["mad"],["wide"],$us32,["ARGS"],["r64o"],["r32i"],["r32i"],["r64i"]], + [["sad"],$us16,["ARGS"],["r16o"],["r16i"],["r16i"],["r16i"]], + [["sad"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["sad"],$us64,["ARGS"],["r64o"],["r64i"],["r64i"],["r64i"]], + [["add","sub","div","rem","min","max"],$us16,["ARGS"],["r16o"],["r16i"],["r16i"]], + [["add","sub","div","rem","min","max"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"]], + [["add","sub","div","rem","min","max"],$us64,["ARGS"],["r64o"],["r64i"],["r64i"]], + [["neg","abs"],["s16"],["ARGS"],["r16o"],["r16i"]], + [["neg","abs"],["s32"],["ARGS"],["r32o"],["r32i"]], + [["neg","abs"],["s64"],["ARGS"],["r64o"],["r64i"]], + [["popc"],["b32"],["ARGS"],["r32o"],["r32i"]], + [["popc"],["b64"],["ARGS"],["r32o"],["r64i"]], + [["clz"],["b32"],["ARGS"],["r32o"],["r32i"]], + [["clz"],["b64"],["ARGS"],["r32o"],["r64i"]], + [["bfind"],["shiftamt",""],[@$us32],["ARGS"],["r32o"],["r32i"]], + [["bfind"],["shiftamt",""],[@$us64],["ARGS"],["r32o"],["r64i"]], + [["brev"],["b32"],["ARGS"],["r32o"],["r32i"]], + [["brev"],["b64"],["ARGS"],["r64o"],["r64i"]], + [["bfe"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["bfe"],$us64,["ARGS"],["r64o"],["r64i"],["r32i"],["r32i"]], + [["bfi"],["b32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"],["r32i"]], + 
[["bfi"],["b64"],["ARGS"],["r64o"],["r64i"],["r64i"],["r32i"],["r32i"]], + # Extended-Precision Arithmetic Instructions + [["add","addc","sub","subc"],["cc",""],$us32,["ARGS"],["r32o"],["r32i"],["r32i"]], + [["mad","madc"],["hi","lo"],["cc"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + # Floating-Point Instructions + [["testp"],["finite","infinite","number","notanumber","normal","subnormal"],["f32"],["ARGS"],["po"],["r32i"]], + [["testp"],["finite","infinite","number","notanumber","normal","subnormal"],["f64"],["ARGS"],["po"],["r64i"]], + [["copysign"],["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["copysign"],["f64"],["ARGS"],["r64o"],["r64i"],["r64i"]], + [["add","sub","mul"],$frnd,$ftz,$sat,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["add","sub","mul"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"],["r64i"]], + [["min","max"],$ftz,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["min","max"],["f64"],["ARGS"],["r64o"],["r64i"],["r64i"]], + [["fma","mad"],$frnd,$ftz,$sat,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["fma","mad"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"],["r64i"],["r64i"]], + [["div"],["approx","full",@$frnd],$ftz,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["div"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"],["r64i"]], + [["neg","abs"],$ftz,["f32"],["ARGS"],["r32o"],["r32i"]], + [["neg","abs"],["f64"],["ARGS"],["r64o"],["r64i"]], + [["rcp","sqrt"],["approx",@$frnd],$ftz,["f32"],["ARGS"],["r32o"],["r32i"]], + [["rcp"],$frnd,["ftz"],["f64"],["ARGS"],["r64o"],["r64i"]], + [["sqrt"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"]], + [["rcp"],["approx"],["ftz"],["f64"],["ARGS"],["r64o"],["r64i"]], + [["sqrt"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"]], + [["rsqrt","sin","cos","lg2","ex2"],["approx"],$ftz,["f32"],["ARGS"],["r32o"],["r32i"]], + [["rsqrt"],["approx"],["f64"],["ARGS"],["r64o"],["r64i"]], + # Comparison and Selection Instructions: + [["set"],$bcmp,$fus32,["b16"],["ARGS"],["r32o"],["r16i"],["r16i"]], + 
[["set"],$ucmp,$fus32,["u16"],["ARGS"],["r32o"],["r16i"],["r16i"]], + [["set"],$scmp,$fus32,["s16"],["ARGS"],["r32o"],["r16i"],["r16i"]], + [["set"],$bcmp,$fus32,["b32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["set"],$ucmp,$fus32,["u32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["set"],$scmp,$fus32,["s32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["set"],$fcmp,$ftz,$fus32,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["set"],$bcmp,$fus32,["b64"],["ARGS"],["r32o"],["r64i"],["r64i"]], + [["set"],$ucmp,$fus32,["u64"],["ARGS"],["r32o"],["r64i"],["r64i"]], + [["set"],$scmp,$fus32,["s64"],["ARGS"],["r32o"],["r64i"],["r64i"]], + [["set"],$fcmp,$fus32,["f64"],["ARGS"],["r32o"],["r64i"],["r64i"]], + [["set"],$bcmp,$bool,$fus32,["b16"],["ARGS"],["r32o"],["r16i"],["r16i"],["pi"]], + [["set"],$ucmp,$bool,$fus32,["u16"],["ARGS"],["r32o"],["r16i"],["r16i"],["pi"]], + [["set"],$scmp,$bool,$fus32,["s16"],["ARGS"],["r32o"],["r16i"],["r16i"],["pi"]], + [["set"],$bcmp,$bool,$fus32,["b32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]], + [["set"],$ucmp,$bool,$fus32,["u32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]], + [["set"],$scmp,$bool,$fus32,["s32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]], + [["set"],$fcmp,$bool,$ftz,$fus32,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]], + [["set"],$bcmp,$bool,$fus32,["b64"],["ARGS"],["r32o"],["r64i"],["r64i"],["pi"]], + [["set"],$ucmp,$bool,$fus32,["u64"],["ARGS"],["r32o"],["r64i"],["r64i"],["pi"]], + [["set"],$scmp,$bool,$fus32,["s64"],["ARGS"],["r32o"],["r64i"],["r64i"],["pi"]], + [["set"],$fcmp,$bool,$fus32,["f64"],["ARGS"],["r32o"],["r64i"],["r64i"],["pi"]], + [["setp"],$bcmp,["b16"],["ARGS"],["po"],["r16i"],["r16i"]], + [["setp"],$ucmp,["u16"],["ARGS"],["po"],["r16i"],["r16i"]], + [["setp"],$scmp,["s16"],["ARGS"],["po"],["r16i"],["r16i"]], + [["setp"],$bcmp,["b32"],["ARGS"],["po"],["r32i"],["r32i"]], + [["setp"],$ucmp,["u32"],["ARGS"],["po"],["r32i"],["r32i"]], + [["setp"],$scmp,["s32"],["ARGS"],["po"],["r32i"],["r32i"]], + 
[["setp"],$fcmp,$ftz,["f32"],["ARGS"],["po"],["r32i"],["r32i"]], + [["setp"],$bcmp,["b64"],["ARGS"],["po"],["r64i"],["r64i"]], + [["setp"],$ucmp,["u64"],["ARGS"],["po"],["r64i"],["r64i"]], + [["setp"],$scmp,["s64"],["ARGS"],["po"],["r64i"],["r64i"]], + [["setp"],$fcmp,["f64"],["ARGS"],["po"],["r64i"],["r64i"]], + [["setp"],$bcmp,$bool,["b16"],["ARGS"],["po"],["r16i"],["r16i"],["pi"]], + [["setp"],$ucmp,$bool,["u16"],["ARGS"],["po"],["r16i"],["r16i"],["pi"]], + [["setp"],$scmp,$bool,["s16"],["ARGS"],["po"],["r16i"],["r16i"],["pi"]], + [["setp"],$bcmp,$bool,["b32"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]], + [["setp"],$ucmp,$bool,["u32"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]], + [["setp"],$scmp,$bool,["s32"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]], + [["setp"],$fcmp,$bool,$ftz,["f32"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]], + [["setp"],$bcmp,$bool,["b64"],["ARGS"],["po"],["r64i"],["r64i"],["pi"]], + [["setp"],$ucmp,$bool,["u64"],["ARGS"],["po"],["r64i"],["r64i"],["pi"]], + [["setp"],$scmp,$bool,["s64"],["ARGS"],["po"],["r64i"],["r64i"],["pi"]], + [["setp"],$fcmp,$bool,["f64"],["ARGS"],["po"],["r64i"],["r64i"],["pi"]], + [["selp"],$bus16,["ARGS"],["r16o"],["r16i"],["r16i"],["pi"]], + [["selp"],$busf32,["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]], + [["selp"],$busf64,["ARGS"],["r64o"],["r64i"],["r64i"],["pi"]], + [["slct"],$bus16,["s32"],["ARGS"],["r16o"],["r16i"],["r16i"],["r32i"]], + [["slct"],$ftz,$bus16,["f32"],["ARGS"],["r16o"],["r16i"],["r16i"],["r32i"]], + [["slct"],$busf32,["s32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["slct"],$ftz,$busf32,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["slct"],$busf64,["s32"],["ARGS"],["r64o"],["r64i"],["r64i"],["r32i"]], + [["slct"],$ftz,$busf64,["f32"],["ARGS"],["r64o"],["r64i"],["r64i"],["r32i"]], + # Logic and Shift Instructions + [$bool,["pred"],["ARGS"],["po"],["pi"],["pi"]], + [$bool,["b16"],["ARGS"],["r16o"],["r16i"],["r16i"]], + [$bool,["b32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + 
[$bool,["b64"],["ARGS"],["r64o"],["r64i"],["r64i"]], + [["not","cnot"],["b16"],["ARGS"],["r16o"],["r16i"]], + [["not","cnot"],["b32"],["ARGS"],["r32o"],["r32i"]], + [["not","cnot"],["b64"],["ARGS"],["r64o"],["r64i"]], + [["shf"],["l","r"],$shclamp,["b32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["shl"],["b16"],["ARGS"],["r16o"],["r16i"],["r32i"]], + [["shl"],["b32"],["ARGS"],["r32o"],["r32i"],["r32i"]], + [["shl"],["b64"],["ARGS"],["r64o"],["r64i"],["r32i"]], + [["shr"],$bus16,["ARGS"],["r16o"],["r16i"],["r32i"]], + [["shr"],$bus32,["ARGS"],["r32o"],["r32i"],["r32i"]], + [["shr"],$bus64,["ARGS"],["r64o"],["r64i"],["r32i"]], + # Data Movement and Conversion Instructions + [["mov"],["pred"],["ARGS"],["po"],["pi"]], + [["mov"],$bus16,["ARGS"],["r16o"],["r16i"]], + [["mov"],$bus32,["ARGS"],["r32o"],["r32i"]], + [["mov"],$busf32,["ARGS"],["r32o"],["r32i"]], + [["mov"],$busf64,["ARGS"],["r64o"],["r64i"]], + # TODO: vector, sreg + [["shfl"],["up","down","bfly","idx"],["b32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["prmt"],["b32"],["f4e","b4e","rc8","ecl","ecr","rc16"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]], + [["ld"],["const","global","local","param","shared"],$lcopv,$bus8,["ARGS"],["r8o"],["ptr"]], + [["ld"],["const","global","local","param","shared"],$lcopv,$bus16,["ARGS"],["r16o"],["ptr"]], + [["ld"],["const","global","local","param","shared"],$lcopv,$busf32,["ARGS"],["r32o"],["ptr"]], + [["ld"],["const","global","local","param","shared"],$lcopv,$busf64,["ARGS"],["r64o"],["ptr"]], + [["ld"],["volatile"],["global","shared"],$bus8,["ARGS"],["r8o"],["ptr"]], + [["ld"],["volatile"],["global","shared"],$bus16,["ARGS"],["r16o"],["ptr"]], + [["ld"],["volatile"],["global","shared"],$busf32,["ARGS"],["r32o"],["ptr"]], + [["ld"],["volatile"],["global","shared"],$busf64,["ARGS"],["r64o"],["ptr"]], + # TODO: vector ld + [["ld"],["global"],$lcop,["nc"],$bus8,["ARGS"],["r8o"],["ptr"]], + [["ld"],["global"],$lcop,["nc"],$bus16,["ARGS"],["r16o"],["ptr"]], + 
[["ld"],["global"],$lcop,["nc"],$busf32,["ARGS"],["r32o"],["ptr"]], + [["ld"],["global"],$lcop,["nc"],$busf64,["ARGS"],["r64o"],["ptr"]], + [["ldu"],["global"],$bus8,["ARGS"],["r8o"],["ptr"]], + [["ldu"],["global"],$bus16,["ARGS"],["r16o"],["ptr"]], + [["ldu"],["global"],$busf32,["ARGS"],["r32o"],["ptr"]], + [["ldu"],["global"],$busf64,["ARGS"],["r64o"],["ptr"]], + [["st"],["global","local","param","shared"],$scop,$bus8,["ARGS"],["ptr"],["r8i"]], + [["st"],["global","local","param","shared"],$scop,$bus16,["ARGS"],["ptr"],["r16i"]], + [["st"],["global","local","param","shared"],$scop,$busf32,["ARGS"],["ptr"],["r32i"]], + [["st"],["global","local","param","shared"],$scop,$busf64,["ARGS"],["ptr"],["r64i"]], + [["st"],["global","local","param","shared"],$scop,$bus8,["ARGS"],["ptr"],["r8i"]], + [["st"],["volatile"],["global","shared"],$bus8,["ARGS"],["ptr"],["r8i"]], + [["st"],["volatile"],["global","shared"],$bus16,["ARGS"],["ptr"],["r16i"]], + [["st"],["volatile"],["global","shared"],$busf32,["ARGS"],["ptr"],["r32i"]], + [["st"],["volatile"],["global","shared"],$busf64,["ARGS"],["ptr"],["r64i"]], + [["prefetch"],["L1","L2"],["global","local"],["ARGS"],["ptr"]], + [["prefetchu"],["L1"],["ARGS"],["ptr"]], + [["isspacep"],["global","local","const","shared"],["ARGS"],["po"],["r32i"]], + [["cvta"],["global","local","const","shared"],["u32"],["ARGS"],["r32o"],["r32i"]], + [["cvta"],["global","local","const","shared"],["u64"],["ARGS"],["r64o"],["r64i"]], + # NOTE: skipped most of cvts + [["cvt"],$us8,$us32,["ARGS"],["r8o"],["r32i"]], + [["cvt"],$irnd,$ftz,$sat,$us32,["f32"],["ARGS"],["r32o"],["r32i"]], + [["cvt"],$frnd,$ftz,$sat,["f32"],$us32,["ARGS"],["r32o"],["r32i"]], + # TODO: Texture Instructions + # TODO: Surface Instructions + # TODO: Control Flow Instructions + # Parallel Synchronization and Communication Instructions + [["bar"],["sync","arrive"],["ARGS"],["r32o"],["r32i"]], + [["bar"],["red"],["popc"],["u32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]], + 
[["bar"],["red"],["and","or"],["pred"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]], + [["membar"],["cta","gl","sys"]], + [["atom"],["global","shared"],["and","or","xor","exch"],["b32"],["ARGS"],["r32o"],["ptr"],["r32i"]], + [["atom"],["global","shared"],["cas"],["b32"],["ARGS"],["r32o"],["ptr"],["r32i"],["r32i"]], + [["atom"],["global","shared"],["add"],$fus32,["ARGS"],["r32o"],["ptr"],["r32i"]], + [["atom"],["global","shared"],["inc","dec"],["u32"],["ARGS"],["r32o"],["ptr"],["r32i"]], + [["atom"],["global","shared"],["min","max"],$us32,["ARGS"],["r32o"],["ptr"],["r32i"]], + [["atom"],["global","shared"],["and","or","xor","exch"],["b64"],["ARGS"],["r64o"],["ptr"],["r64i"]], + [["atom"],["global","shared"],["cas"],["b64"],["ARGS"],["r64o"],["ptr"],["r64i"],["r64i"]], + [["atom"],["global","shared"],["min","max"],$us64,["ARGS"],["r64o"],["ptr"],["r64i"]], + [["atom"],["global","shared"],["add"],["u64"],["ARGS"],["r64o"],["ptr"],["r64i"]], + [["vote"],["all","any","uni"],["pred"],["ARGS"],["po"],["pi"]], + [["vote"],["ballot"],["b32"],["ARGS"],["r32o"],["pi"]], +); + +foreach $entry (@entries){ + my @op_descs = cartesian(@$entry); + foreach $op_desc (@op_descs){ + my $name = ""; + my $insn = ""; + my %desc = (ver=>$ver,arch=>$arch,dir=>$dir); + my $args = 0; + foreach $field (@$op_desc){ + next if $field eq ""; + if($field eq "ARGS"){ + chop($insn); # chop . 
at the end of $insn + $args = 1; + next; + } + foreach (@types){ + if ($_ eq $field){ + $desc{type}=$field; + last; + } + } + $name.=$field."_"; + if($args){ + if($field eq "ptr"){ + $insn.=" [ptr],"; + } + else{ + $insn.=" ".$field.($desc{$field}++).","; + } + } + else{ + $insn.=$field."."; + } + } + chop($name); + $name.=".ptx"; + chop($insn); + $insn.=";"; + $desc{insn}=$insn; + $desc{outfile}=$dir.$name; + gen_ptx(\%desc); + } +} diff --git a/Solver/src/test.cu b/Solver/src/test.cu new file mode 100644 index 0000000..cf0a568 --- /dev/null +++ b/Solver/src/test.cu @@ -0,0 +1,4 @@ +__global__ void test(float& a, float& b) { + do { + } while(1); +} diff --git a/Solver/src/unique.py b/Solver/src/unique.py new file mode 100644 index 0000000..36df1a7 --- /dev/null +++ b/Solver/src/unique.py @@ -0,0 +1,14 @@ +from sets import Set +from inst import Inst +import subprocess +import sys + +if __name__ == "__main__": + opset = Set([]) + with open(sys.argv[1]) as f: + for line in f: + field = line.split() + inst = Inst(field, False) + if not inst.op() in opset: + opset.add(inst.op()) + sys.stdout.write(line)