Skip to content
Permalink
Browse files

Fix CMSAR1 execution to use the correct multiplier (the value passed to vu1ExecMicro is now multiplied by 8)

Fix ILW/ISW/LQ/SQ on microVU for reading VU1 regs

Marvel Nemesis - Rise of the Imperfects now goes in-game, but it's quite messy
  • Loading branch information
refractionpcsx2 committed Dec 30, 2019
1 parent 90b0e7a commit 10dd9412a1743c81f50538137aa727ed4bb73fda
Showing with 93 additions and 94 deletions.
  1. +1 −1 pcsx2/VU0.cpp
  2. +91 −93 pcsx2/x86/microVU_Lower.inl
  3. +1 −0 pcsx2/x86/microVU_Macro.inl
@@ -167,7 +167,7 @@ void CTC2() {
break;
case REG_CMSAR1: // REG_CMSAR1
if (!(VU0.VI[REG_VPU_STAT].UL & 0x100) ) {
vu1ExecMicro(cpuRegs.GPR.r[_Rt_].US[0]); // Execute VU1 Micro SubRoutine
vu1ExecMicro(cpuRegs.GPR.r[_Rt_].US[0] * 8); // Execute VU1 Micro SubRoutine
vif1VUFinish();
}
break;
@@ -153,8 +153,8 @@ mVUop(mVU_RSQRT) {
// EATANhelper(addr): expands to a braced block that emits one accumulation step.
// It relies on the names mVU, t1, t2, Fs and PQ being in scope at the expansion site.
// Sequence as written: t2 is multiplied by Fs twice, t1 is loaded from t2 and
// multiplied by the 32-bit value at `addr`, and t1 is then added into PQ.
// NOTE(review): the xMOVAPS/xMUL.SS pair is listed twice below. Since t1 is reloaded
// from t2 before each multiply, the second copy just recomputes the same t1; this looks
// like the removed/added sides of a whitespace-only diff hunk -- confirm against the real file.
#define EATANhelper(addr) { \
SSE_MULSS(mVU, t2, Fs); \
SSE_MULSS(mVU, t2, Fs); \
xMOVAPS (t1, t2); \
xMUL.SS (t1, ptr32[addr]); \
xMOVAPS (t1, t2); \
xMUL.SS (t1, ptr32[addr]); \
SSE_ADDSS(mVU, PQ, t1); \
}

@@ -182,7 +182,7 @@ mVUop(mVU_EATAN) {
const xmm& t2 = mVU.regAlloc->allocReg();
xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xMOVSS (xmmPQ, Fs);
xSUB.SS(Fs, ptr32[mVUglob.one]);
xSUB.SS(Fs, ptr32[mVUglob.one]);
xADD.SS(xmmPQ, ptr32[mVUglob.one]);
SSE_DIVSS(mVU, Fs, xmmPQ);
mVU_EATAN_(mVU, xmmPQ, Fs, t1, t2);
@@ -238,8 +238,8 @@ mVUop(mVU_EATANxz) {

// eexpHelper(addr): expands to a braced block that emits one accumulation step.
// It relies on the names mVU, t1, t2, Fs and xmmPQ being in scope at the expansion site.
// Sequence as written: t2 is multiplied by Fs once, t1 is loaded from t2 and
// multiplied by the 32-bit value at `addr`, and t1 is then added into xmmPQ.
// NOTE(review): the xMOVAPS/xMUL.SS pair is listed twice below. Since t1 is reloaded
// from t2 before each multiply, the second copy just recomputes the same t1; this looks
// like the removed/added sides of a whitespace-only diff hunk -- confirm against the real file.
#define eexpHelper(addr) { \
SSE_MULSS(mVU, t2, Fs); \
xMOVAPS (t1, t2); \
xMUL.SS (t1, ptr32[addr]); \
xMOVAPS (t1, t2); \
xMUL.SS (t1, ptr32[addr]); \
SSE_ADDSS(mVU, xmmPQ, t1); \
}

@@ -253,22 +253,22 @@ mVUop(mVU_EEXP) {
xMOVSS (xmmPQ, Fs);
xMUL.SS (xmmPQ, ptr32[mVUglob.E1]);
xADD.SS (xmmPQ, ptr32[mVUglob.one]);
xMOVAPS (t1, Fs);
xMOVAPS (t1, Fs);
SSE_MULSS(mVU, t1, Fs);
xMOVAPS (t2, t1);
xMUL.SS (t1, ptr32[mVUglob.E2]);
xMOVAPS (t2, t1);
xMUL.SS (t1, ptr32[mVUglob.E2]);
SSE_ADDSS(mVU, xmmPQ, t1);
eexpHelper(&mVUglob.E3);
eexpHelper(&mVUglob.E4);
eexpHelper(&mVUglob.E5);
SSE_MULSS(mVU, t2, Fs);
xMUL.SS (t2, ptr32[mVUglob.E6]);
xMUL.SS (t2, ptr32[mVUglob.E6]);
SSE_ADDSS(mVU, xmmPQ, t2);
SSE_MULSS(mVU, xmmPQ, xmmPQ);
SSE_MULSS(mVU, xmmPQ, xmmPQ);
xMOVSSZX (t2, ptr32[mVUglob.one]);
xMOVSSZX (t2, ptr32[mVUglob.one]);
SSE_DIVSS(mVU, t2, xmmPQ);
xMOVSS (xmmPQ, t2);
xMOVSS (xmmPQ, t2);
xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU.regAlloc->clearNeeded(Fs);
mVU.regAlloc->clearNeeded(t1);
@@ -285,23 +285,23 @@ static __fi void mVU_sumXYZ(mV, const xmm& PQ, const xmm& Fs) {
xMOVSS(PQ, Fs);
}
else {
SSE_MULPS(mVU, Fs, Fs); // wzyx ^ 2
xMOVSS (PQ, Fs); // x ^ 2
xPSHUF.D (Fs, Fs, 0xe1); // wzyx -> wzxy
SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2
xPSHUF.D (Fs, Fs, 0xd2); // wzxy -> wxyz
SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2 + z ^ 2
SSE_MULPS(mVU, Fs, Fs); // wzyx ^ 2
xMOVSS (PQ, Fs); // x ^ 2
xPSHUF.D (Fs, Fs, 0xe1); // wzyx -> wzxy
SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2
xPSHUF.D (Fs, Fs, 0xd2); // wzxy -> wxyz
SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2 + z ^ 2
}
}

mVUop(mVU_ELENG) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); }
pass2 {
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ(mVU, xmmPQ, Fs);
xSQRT.SS (xmmPQ, xmmPQ);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
xSQRT.SS (xmmPQ, xmmPQ);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU.regAlloc->clearNeeded(Fs);
mVU.profiler.EmitOp(opELENG);
}
@@ -312,12 +312,12 @@ mVUop(mVU_ERCPR) {
pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 12); }
pass2 {
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xMOVSS (xmmPQ, Fs);
xMOVSSZX (Fs, ptr32[mVUglob.one]);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xMOVSS (xmmPQ, Fs);
xMOVSSZX (Fs, ptr32[mVUglob.one]);
SSE_DIVSS(mVU, Fs, xmmPQ);
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU.regAlloc->clearNeeded(Fs);
mVU.profiler.EmitOp(opERCPR);
}
@@ -328,13 +328,13 @@ mVUop(mVU_ERLENG) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 24); }
pass2 {
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ(mVU, xmmPQ, Fs);
xSQRT.SS (xmmPQ, xmmPQ);
xMOVSSZX (Fs, ptr32[mVUglob.one]);
xSQRT.SS (xmmPQ, xmmPQ);
xMOVSSZX (Fs, ptr32[mVUglob.one]);
SSE_DIVSS (mVU, Fs, xmmPQ);
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU.regAlloc->clearNeeded(Fs);
mVU.profiler.EmitOp(opERLENG);
}
@@ -345,12 +345,12 @@ mVUop(mVU_ERSADD) {
pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); }
pass2 {
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
mVU_sumXYZ(mVU, xmmPQ, Fs);
xMOVSSZX (Fs, ptr32[mVUglob.one]);
xMOVSSZX (Fs, ptr32[mVUglob.one]);
SSE_DIVSS (mVU, Fs, xmmPQ);
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU.regAlloc->clearNeeded(Fs);
mVU.profiler.EmitOp(opERSADD);
}
@@ -361,13 +361,13 @@ mVUop(mVU_ERSQRT) {
pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 18); }
pass2 {
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xAND.PS (Fs, ptr128[mVUglob.absclip]);
xSQRT.SS (xmmPQ, Fs);
xMOVSSZX (Fs, ptr32[mVUglob.one]);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xAND.PS (Fs, ptr128[mVUglob.absclip]);
xSQRT.SS (xmmPQ, Fs);
xMOVSSZX (Fs, ptr32[mVUglob.one]);
SSE_DIVSS(mVU, Fs, xmmPQ);
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU.regAlloc->clearNeeded(Fs);
mVU.profiler.EmitOp(opERSQRT);
}
@@ -393,29 +393,29 @@ mVUop(mVU_ESIN) {
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
const xmm& t1 = mVU.regAlloc->allocReg();
const xmm& t2 = mVU.regAlloc->allocReg();
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xMOVSS (xmmPQ, Fs); // pq = X
SSE_MULSS(mVU, Fs, Fs); // fs = X^2
xMOVAPS (t1, Fs); // t1 = X^2
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xMOVSS (xmmPQ, Fs); // pq = X
SSE_MULSS(mVU, Fs, Fs); // fs = X^2
xMOVAPS (t1, Fs); // t1 = X^2
SSE_MULSS(mVU, Fs, xmmPQ); // fs = X^3
xMOVAPS (t2, Fs); // t2 = X^3
xMUL.SS (Fs, ptr32[mVUglob.S2]); // fs = s2 * X^3
xMOVAPS (t2, Fs); // t2 = X^3
xMUL.SS (Fs, ptr32[mVUglob.S2]); // fs = s2 * X^3
SSE_ADDSS(mVU, xmmPQ, Fs); // pq = X + s2 * X^3

SSE_MULSS(mVU, t2, t1); // t2 = X^3 * X^2
xMOVAPS (Fs, t2); // fs = X^5
xMUL.SS (Fs, ptr32[mVUglob.S3]); // ps = s3 * X^5
SSE_MULSS(mVU, t2, t1); // t2 = X^3 * X^2
xMOVAPS (Fs, t2); // fs = X^5
xMUL.SS (Fs, ptr32[mVUglob.S3]); // ps = s3 * X^5
SSE_ADDSS(mVU, xmmPQ, Fs); // pq = X + s2 * X^3 + s3 * X^5

SSE_MULSS(mVU, t2, t1); // t2 = X^5 * X^2
xMOVAPS (Fs, t2); // fs = X^7
xMUL.SS (Fs, ptr32[mVUglob.S4]); // fs = s4 * X^7
SSE_MULSS(mVU, t2, t1); // t2 = X^5 * X^2
xMOVAPS (Fs, t2); // fs = X^7
xMUL.SS (Fs, ptr32[mVUglob.S4]); // fs = s4 * X^7
SSE_ADDSS(mVU, xmmPQ, Fs); // pq = X + s2 * X^3 + s3 * X^5 + s4 * X^7

SSE_MULSS(mVU, t2, t1); // t2 = X^7 * X^2
xMUL.SS (t2, ptr32[mVUglob.S5]); // t2 = s5 * X^9
SSE_MULSS(mVU, t2, t1); // t2 = X^7 * X^2
xMUL.SS (t2, ptr32[mVUglob.S5]); // t2 = s5 * X^9
SSE_ADDSS(mVU, xmmPQ, t2); // pq = X + s2 * X^3 + s3 * X^5 + s4 * X^7 + s5 * X^9
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU.regAlloc->clearNeeded(Fs);
mVU.regAlloc->clearNeeded(t1);
mVU.regAlloc->clearNeeded(t2);
@@ -443,13 +443,13 @@ mVUop(mVU_ESUM) {
pass2 {
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
const xmm& t1 = mVU.regAlloc->allocReg();
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xPSHUF.D (t1, Fs, 0x1b);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
xPSHUF.D (t1, Fs, 0x1b);
SSE_ADDPS(mVU, Fs, t1);
xPSHUF.D (t1, Fs, 0x01);
xPSHUF.D (t1, Fs, 0x01);
SSE_ADDSS(mVU, Fs, t1);
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
xMOVSS (xmmPQ, Fs);
xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
mVU.regAlloc->clearNeeded(Fs);
mVU.regAlloc->clearNeeded(t1);
mVU.profiler.EmitOp(opESUM);
@@ -842,15 +842,14 @@ mVUop(mVU_ILW) {
}
pass2 {
xAddressVoid ptr(mVU.regs().Mem + offsetSS);
if (_Is_) {
mVUallocVIa(mVU, gprT2, _Is_);
xADD(gprT2, _Imm11_);
mVUaddrFix (mVU, gprT2);
ptr += gprT2;
}
else {
ptr += getVUmem(_Imm11_);
}

mVUallocVIa(mVU, gprT2, _Is_);
if (!_Is_)
xXOR(gprT2, gprT2);
xADD(gprT2, _Imm11_);
mVUaddrFix (mVU, gprT2);
ptr += gprT2;

xMOVZX(gprT1, ptr16[ptr]);
mVUallocVIb(mVU, gprT1, _It_);
mVU.profiler.EmitOp(opILW);
@@ -891,14 +890,14 @@ mVUop(mVU_ISW) {
}
pass2 {
xAddressVoid ptr(mVU.regs().Mem);
if (_Is_) {
mVUallocVIa(mVU, gprT2, _Is_);
xADD(gprT2, _Imm11_);
mVUaddrFix (mVU, gprT2);
ptr += gprT2;
}
else
ptr += getVUmem(_Imm11_);

mVUallocVIa(mVU, gprT2, _Is_);
if (!_Is_)
xXOR(gprT2, gprT2);
xADD(gprT2, _Imm11_);
mVUaddrFix (mVU, gprT2);
ptr += gprT2;

mVUallocVIa(mVU, gprT1, _It_);
if (_X) xMOV(ptr32[ptr], gprT1);
if (_Y) xMOV(ptr32[ptr+4], gprT1);
@@ -938,14 +937,13 @@ mVUop(mVU_LQ) {
pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, false); }
pass2 {
xAddressVoid ptr(mVU.regs().Mem);
if (_Is_) {
mVUallocVIa(mVU, gprT2, _Is_);
xADD(gprT2, _Imm11_);
mVUaddrFix(mVU, gprT2);
ptr += gprT2;
}
else
ptr += getVUmem(_Imm11_);
mVUallocVIa(mVU, gprT2, _Is_);
if (!_Is_)
xXOR(gprT2, gprT2);
xADD(gprT2, _Imm11_);
mVUaddrFix(mVU, gprT2);
ptr += gprT2;

const xmm& Ft = mVU.regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
mVUloadReg(Ft, ptr, _X_Y_Z_W);
mVU.regAlloc->clearNeeded(Ft);
@@ -1006,14 +1004,14 @@ mVUop(mVU_SQ) {
pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, false); }
pass2 {
xAddressVoid ptr(mVU.regs().Mem);
if (_It_) {
mVUallocVIa(mVU, gprT2, _It_);
xADD(gprT2, _Imm11_);
mVUaddrFix(mVU, gprT2);
ptr += gprT2;
}
else
ptr += getVUmem(_Imm11_);

mVUallocVIa(mVU, gprT2, _It_);
if (!_It_)
xXOR(gprT2, gprT2);
xADD(gprT2, _Imm11_);
mVUaddrFix(mVU, gprT2);
ptr += gprT2;

const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
mVUsaveReg(Fs, ptr, _X_Y_Z_W, 1);
mVU.regAlloc->clearNeeded(Fs);
@@ -347,6 +347,7 @@ static void recCTC2() {
case REG_CMSAR1: // Execute VU1 Micro SubRoutine
if (_Rt_) {
xMOV(ecx, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
xSHL(ecx, 3);
}
else xXOR(ecx, ecx);
xFastCall((void*)vu1ExecMicro, ecx);

0 comments on commit 10dd941

Please sign in to comment.
You can’t perform that action at this time.