packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch

From 224c1919ad3f68e23e817f41036687343f34aaae Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Fri, 12 Jun 2015 17:27:47 +0100
Subject: [PATCH] [utils] Disable fast_memcpy which is slower than memcpy

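The default glibc memcpy is likely to be better tuned than this code,
which hasn't been touched for four years. Remove the fast_memcpy
implementations (fastmemcpy.c and fastmemcpy-arm.S), the header and
the unit test, and switch all callers to plain memcpy.

In a test with software video decode on a Pi2, the number of skipped
frames went from 189 to 172 when fast_memcpy was disabled.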
---
 Kodi.xcodeproj/project.pbxproj                     |   6 -
 project/VS2010Express/XBMC.vcxproj                 |   4 -
 project/VS2010Express/XBMC.vcxproj.filters         |   3 -
 xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp    |   1 -
 xbmc/cores/VideoRenderers/RenderCapture.cpp        |   7 +-
 xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp   |  33 +-
 .../Video/libstagefrightICS/StageFrightVideo.cpp   |   3 +-
 xbmc/utils/Makefile.in                             |   2 -
 xbmc/utils/fastmemcpy-arm.S                        | 528 ---------------------
 xbmc/utils/fastmemcpy.c                            | 396 ----------------
 xbmc/utils/fastmemcpy.h                            |  35 --
 xbmc/utils/test/Makefile                           |   1 -
 xbmc/utils/test/Testfastmemcpy.cpp                 |  39 --
 13 files changed, 20 insertions(+), 1038 deletions(-)
 delete mode 100644 xbmc/utils/fastmemcpy-arm.S
 delete mode 100644 xbmc/utils/fastmemcpy.c
 delete mode 100644 xbmc/utils/fastmemcpy.h
 delete mode 100644 xbmc/utils/test/Testfastmemcpy.cpp

diff --git a/Kodi.xcodeproj/project.pbxproj b/Kodi.xcodeproj/project.pbxproj
index 395c4ea..ce5a7f7 100644
--- a/Kodi.xcodeproj/project.pbxproj
+++ b/Kodi.xcodeproj/project.pbxproj
@@ -3192,7 +3192,6 @@
 		F5E55B5D10741272006E788A /* DVDPlayerTeletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B5B10741272006E788A /* DVDPlayerTeletext.cpp */; };
 		F5E55B66107412DE006E788A /* GUIDialogTeletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B65107412DE006E788A /* GUIDialogTeletext.cpp */; };
 		F5E55B7010741340006E788A /* Teletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B6E10741340006E788A /* Teletext.cpp */; };
-		F5E5697310803FC3006E788A /* fastmemcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = F5E5697210803FC3006E788A /* fastmemcpy.c */; };
 		F5E56BA61082A675006E788A /* PosixMountProvider.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E56BA51082A675006E788A /* PosixMountProvider.cpp */; };
 		F5EA02260F6DA990005C2EC5 /* CocoaPowerSyscall.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5EA02200F6DA85C005C2EC5 /* CocoaPowerSyscall.cpp */; };
 		F5EA02270F6DA9A5005C2EC5 /* PowerManager.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5EA021A0F6DA7E8005C2EC5 /* PowerManager.cpp */; };
@@ -3632,7 +3631,6 @@
 		43348AAB1077486D00F859CF /* PlayerSelectionRule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PlayerSelectionRule.h; path = playercorefactory/PlayerSelectionRule.h; sourceTree = "<group>"; };
 		436721A612D66A09002508E6 /* IAnnouncer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = IAnnouncer.h; sourceTree = "<group>"; };
 		436B38F3106628850049AB3B /* EndianSwap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = EndianSwap.h; sourceTree = "<group>"; };
-		43BF09DD1080D39300E25290 /* fastmemcpy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fastmemcpy.h; sourceTree = "<group>"; };
 		43FAC87112D6349400F67914 /* IStorageProvider.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = IStorageProvider.h; sourceTree = "<group>"; };
 		551C3A43175A12010051AAAD /* VDA.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = VDA.cpp; sourceTree = "<group>"; };
 		551C3A44175A12010051AAAD /* VDA.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = VDA.h; sourceTree = "<group>"; };
@@ -5735,7 +5733,6 @@
 		F5E55B6D10741340006E788A /* Teletext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Teletext.h; sourceTree = "<group>"; };
 		F5E55B6E10741340006E788A /* Teletext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Teletext.cpp; sourceTree = "<group>"; };
 		F5E55B6F10741340006E788A /* TeletextDefines.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = TeletextDefines.h; sourceTree = "<group>"; };
-		F5E5697210803FC3006E788A /* fastmemcpy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fastmemcpy.c; sourceTree = "<group>"; };
 		F5E56BA41082A675006E788A /* PosixMountProvider.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PosixMountProvider.h; sourceTree = "<group>"; };
 		F5E56BA51082A675006E788A /* PosixMountProvider.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PosixMountProvider.cpp; sourceTree = "<group>"; };
 		F5EA021A0F6DA7E8005C2EC5 /* PowerManager.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PowerManager.cpp; sourceTree = "<group>"; };
@@ -9202,8 +9199,6 @@
 				DF529BAD1741697B00523FB4 /* Environment.h */,
 				E36C29E90DA72486001F0C9D /* Fanart.cpp */,
 				6E97BDC30DA2B620003A2A89 /* Fanart.h */,
-				F5E5697210803FC3006E788A /* fastmemcpy.c */,
-				43BF09DD1080D39300E25290 /* fastmemcpy.h */,
 				F5F244641110DC6B009126C6 /* FileOperationJob.cpp */,
 				F5F244631110DC6B009126C6 /* FileOperationJob.h */,
 				F5F245EC1112C9AB009126C6 /* FileUtils.cpp */,
@@ -10519,7 +10514,6 @@
 				43348AAE1077486D00F859CF /* PlayerCoreFactory.cpp in Sources */,
 				43348AAF1077486D00F859CF /* PlayerSelectionRule.cpp in Sources */,
 				7CAA20511079C8160096DE39 /* BaseRenderer.cpp in Sources */,
-				F5E5697310803FC3006E788A /* fastmemcpy.c in Sources */,
 				55D3604E1826CAB900DA66D2 /* OverlayRendererGUI.cpp in Sources */,
 				F5E56BA61082A675006E788A /* PosixMountProvider.cpp in Sources */,
 				7CAA25351085963B0096DE39 /* PasswordManager.cpp in Sources */,
diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj
index 2d37c57..e8e8dce 100644
--- a/project/VS2010Express/XBMC.vcxproj
+++ b/project/VS2010Express/XBMC.vcxproj
@@ -1439,10 +1439,6 @@
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
     </ClCompile>
-    <ClCompile Include="..\..\xbmc\utils\test\Testfastmemcpy.cpp">
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
-    </ClCompile>
     <ClCompile Include="..\..\xbmc\utils\TimeSmoother.cpp" />
     <ClCompile Include="..\..\xbmc\utils\TimeUtils.cpp" />
     <ClCompile Include="..\..\xbmc\utils\URIUtils.cpp" />
diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters
index c858f32..cada31e 100644
--- a/project/VS2010Express/XBMC.vcxproj.filters
+++ b/project/VS2010Express/XBMC.vcxproj.filters
@@ -2371,9 +2371,6 @@
     <ClCompile Include="..\..\xbmc\utils\test\TestEndianSwap.cpp">
       <Filter>utils\test</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\xbmc\utils\test\Testfastmemcpy.cpp">
-      <Filter>utils\test</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\xbmc\utils\test\TestFileOperationJob.cpp">
       <Filter>utils\test</Filter>
     </ClCompile>
diff --git a/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp b/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp
index 2b64121..fdad7f0 100644
--- a/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp
+++ b/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp
@@ -31,7 +31,6 @@
 #include <locale.h>
 #include "guilib/MatrixGLES.h"
 #include "LinuxRendererGLES.h"
-#include "utils/fastmemcpy.h"
 #include "utils/MathUtils.h"
 #include "utils/GLUtils.h"
 #include "utils/log.h"
diff --git a/xbmc/cores/VideoRenderers/RenderCapture.cpp b/xbmc/cores/VideoRenderers/RenderCapture.cpp
index 603b68d..0456a27 100644
--- a/xbmc/cores/VideoRenderers/RenderCapture.cpp
+++ b/xbmc/cores/VideoRenderers/RenderCapture.cpp
@@ -21,7 +21,6 @@
 #include "RenderCapture.h"
 #include "utils/log.h"
 #include "windowing/WindowingFactory.h"
-#include "utils/fastmemcpy.h"
 #include "settings/AdvancedSettings.h"
 
 CRenderCaptureBase::CRenderCaptureBase()
@@ -297,7 +296,7 @@ void CRenderCaptureGL::PboToBuffer()
 
   if (pboPtr)
   {
-    fast_memcpy(m_pixels, pboPtr, m_bufferSize);
+    memcpy(m_pixels, pboPtr, m_bufferSize);
     SetState(CAPTURESTATE_DONE);
   }
   else
@@ -491,12 +490,12 @@ void CRenderCaptureDX::SurfaceToBuffer()
     //if pitch is same, do a direct copy, otherwise copy one line at a time
     if (lockedRect.Pitch == m_width * 4)
     {
-      fast_memcpy(m_pixels, lockedRect.pBits, m_width * m_height * 4);
+      memcpy(m_pixels, lockedRect.pBits, m_width * m_height * 4);
     }
     else
     {
       for (unsigned int y = 0; y < m_height; y++)
-        fast_memcpy(m_pixels + y * m_width * 4, (uint8_t*)lockedRect.pBits + y * lockedRect.Pitch, m_width * 4);
+        memcpy(m_pixels + y * m_width * 4, (uint8_t*)lockedRect.pBits + y * lockedRect.Pitch, m_width * 4);
     }
     m_copySurface->UnlockRect();
     SetState(CAPTURESTATE_DONE);
diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
index 56e68713..5f0e486 100644
--- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
+++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
@@ -22,7 +22,6 @@
 #include "DVDClock.h"
 #include "cores/VideoRenderers/RenderManager.h"
 #include "utils/log.h"
-#include "utils/fastmemcpy.h"
 #include "cores/FFmpeg.h"
 #include "Util.h"
 #ifdef HAS_DX
@@ -95,7 +94,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc)
 
   for (int y = 0; y < h; y++)
   {
-    fast_memcpy(d, s, w);
+    memcpy(d, s, w);
     s += pSrc->iLineSize[0];
     d += pDst->iLineSize[0];
   }
@@ -107,7 +106,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc)
   d = pDst->data[1];
   for (int y = 0; y < h; y++)
   {
-    fast_memcpy(d, s, w);
+    memcpy(d, s, w);
     s += pSrc->iLineSize[1];
     d += pDst->iLineSize[1];
   }
@@ -116,7 +115,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc)
   d = pDst->data[2];
   for (int y = 0; y < h; y++)
   {
-    fast_memcpy(d, s, w);
+    memcpy(d, s, w);
     s += pSrc->iLineSize[2];
     d += pDst->iLineSize[2];
   }
@@ -131,13 +130,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc)
   int h = pImage->height;
   if ((w == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0]))
   {
-    fast_memcpy(d, s, w*h);
+    memcpy(d, s, w*h);
   }
   else
   {
     for (int y = 0; y < h; y++)
     {
-      fast_memcpy(d, s, w);
+      memcpy(d, s, w);
       s += pSrc->iLineSize[0];
       d += pImage->stride[0];
     }
@@ -148,13 +147,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc)
   h =(pImage->height >> pImage->cshift_y);
   if ((w==pSrc->iLineSize[1]) && ((unsigned int) pSrc->iLineSize[1]==pImage->stride[1]))
   {
-    fast_memcpy(d, s, w*h);
+    memcpy(d, s, w*h);
   }
   else
   {
     for (int y = 0; y < h; y++)
     {
-      fast_memcpy(d, s, w);
+      memcpy(d, s, w);
       s += pSrc->iLineSize[1];
       d += pImage->stride[1];
     }
@@ -163,13 +162,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc)
   d = pImage->plane[2];
   if ((w==pSrc->iLineSize[2]) && ((unsigned int) pSrc->iLineSize[2]==pImage->stride[2]))
   {
-    fast_memcpy(d, s, w*h);
+    memcpy(d, s, w*h);
   }
   else
   {
     for (int y = 0; y < h; y++)
     {
-      fast_memcpy(d, s, w);
+      memcpy(d, s, w);
       s += pSrc->iLineSize[2];
       d += pImage->stride[2];
     }
@@ -207,7 +206,7 @@ DVDVideoPicture* CDVDCodecUtils::ConvertToNV12Picture(DVDVideoPicture *pSrc)
       uint8_t *d = pPicture->data[0];
       for (int y = 0; y < (int)pSrc->iHeight; y++)
       {
-        fast_memcpy(d, s, pSrc->iWidth);
+        memcpy(d, s, pSrc->iWidth);
         s += pSrc->iLineSize[0];
         d += pPicture->iLineSize[0];
       }
@@ -298,13 +297,13 @@ bool CDVDCodecUtils::CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc)
   // Copy Y
   if ((w == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0]))
   {
-    fast_memcpy(d, s, w*h);
+    memcpy(d, s, w*h);
   }
   else
   {
     for (int y = 0; y < h; y++)
     {
-      fast_memcpy(d, s, w);
+      memcpy(d, s, w);
       s += pSrc->iLineSize[0];
       d += pImage->stride[0];
     }
@@ -317,13 +316,13 @@ bool CDVDCodecUtils::CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc)
   // Copy packed UV (width is same as for Y as it's both U and V components)
   if ((w==pSrc->iLineSize[1]) && ((unsigned int) pSrc->iLineSize[1]==pImage->stride[1]))
   {
-    fast_memcpy(d, s, w*h);
+    memcpy(d, s, w*h);
   }
   else
   {
     for (int y = 0; y < h; y++)
     {
-      fast_memcpy(d, s, w);
+      memcpy(d, s, w);
       s += pSrc->iLineSize[1];
       d += pImage->stride[1];
     }
@@ -342,13 +341,13 @@ bool CDVDCodecUtils::CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture
   // Copy YUYV
   if ((w * 2 == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0]))
   {
-    fast_memcpy(d, s, w*h*2);
+    memcpy(d, s, w*h*2);
   }
   else
   {
     for (int y = 0; y < h; y++)
     {
-      fast_memcpy(d, s, w*2);
+      memcpy(d, s, w*2);
       s += pSrc->iLineSize[0];
       d += pImage->stride[0];
     }
diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp
index 019bc7a..d5ca74f 100644
--- a/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp
+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp
@@ -30,7 +30,6 @@
 #include "guilib/GraphicContext.h"
 #include "DVDClock.h"
 #include "utils/log.h"
-#include "utils/fastmemcpy.h"
 #include "threads/Thread.h"
 #include "threads/Event.h"
 #include "Application.h"
@@ -620,7 +619,7 @@ int  CStageFrightVideo::Decode(uint8_t *pData, int iSize, double dts, double pts
       return VC_ERROR;
     }
 
-    fast_memcpy(frame->medbuf->data(), demuxer_content, demuxer_bytes);
+    memcpy(frame->medbuf->data(), demuxer_content, demuxer_bytes);
     frame->medbuf->set_range(0, demuxer_bytes);
     frame->medbuf->meta_data()->clear();
     frame->medbuf->meta_data()->setInt64(kKeyTime, frame->pts);
diff --git a/xbmc/utils/Makefile.in b/xbmc/utils/Makefile.in
index 438f025..dbd3db9 100644
--- a/xbmc/utils/Makefile.in
+++ b/xbmc/utils/Makefile.in
@@ -17,8 +17,6 @@ SRCS += DatabaseUtils.cpp
 SRCS += EndianSwap.cpp
 SRCS += Environment.cpp
 SRCS += Fanart.cpp
-SRCS += fastmemcpy.c
-SRCS += fastmemcpy-arm.S
 SRCS += FileOperationJob.cpp
 SRCS += FileUtils.cpp
 SRCS += fstrcmp.c
diff --git a/xbmc/utils/fastmemcpy-arm.S b/xbmc/utils/fastmemcpy-arm.S
deleted file mode 100644
index 6cb8b0c..0000000
--- a/xbmc/utils/fastmemcpy-arm.S
+++ /dev/null
@@ -1,528 +0,0 @@
-/*
- *      Copyright (C) 2008 The Android Open Source Project
- *      All rights reserved.
- *
- *      Copyright (C) 2011-2013 Team XBMC
- *      http://xbmc.org
- *
- *  This Program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This Program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with XBMC; see the file COPYING.  If not, see
- *  <http://www.gnu.org/licenses/>.
- *
- */
-#if defined(__arm__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS)
-#if defined(__ARM_NEON__)
-
-        .text
-#ifndef __APPLE__
-        .fpu    neon
-        .global fast_memcpy
-        .type fast_memcpy, %function
-#else
-        .globl _fast_memcpy
-#endif
-        .align 4
-
-/* a prefetch distance of 4 cache-lines works best experimentally */
-#define CACHE_LINE_SIZE     64
-#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
-
-#ifndef __APPLE__
-        .fnstart
-        .save       {r0, lr}
-fast_memcpy:
-#else
-_fast_memcpy:
-#endif
-        stmfd       sp!, {r0, lr}
-
-        /* start preloading as early as possible */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
-        pld         [r1, #(CACHE_LINE_SIZE*1)]
-
-        /* do we have at least 16-bytes to copy (needed for alignment below) */
-        cmp         r2, #16
-        blo         5f
-
-        /* align destination to half cache-line for the write-buffer */
-        rsb         r3, r0, #0
-        ands        r3, r3, #0xF
-        beq         0f
-
-        /* copy up to 15-bytes (count in r3) */
-        sub         r2, r2, r3
-        movs        ip, r3, lsl #31
-        ldrmib      lr, [r1], #1
-        strmib      lr, [r0], #1
-        ldrcsb      ip, [r1], #1
-        ldrcsb      lr, [r1], #1
-        strcsb      ip, [r0], #1
-        strcsb      lr, [r0], #1
-        movs        ip, r3, lsl #29
-        bge         1f
-        // copies 4 bytes, destination 32-bits aligned
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1:      bcc         2f
-        // copies 8 bytes, destination 64-bits aligned
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0, :64]!
-2:
-
-0:      /* preload immediately the next cache line, which we may need */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
-        pld         [r1, #(CACHE_LINE_SIZE*1)]
-
-        /* make sure we have at least 64 bytes to copy */
-        subs        r2, r2, #64
-        blo         2f
-
-        /* preload all the cache lines we need.
-         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
-         * ideally would would increase the distance in the main loop to
-         * avoid the goofy code below. In practice this doesn't seem to make
-         * a big difference.
-         */
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
-        pld         [r1, #(CACHE_LINE_SIZE*3)]
-        pld         [r1, #(PREFETCH_DISTANCE)]
-
-1:      /* The main loop copies 64 bytes at a time */
-        vld1.8      {d0  - d3},   [r1]!
-        vld1.8      {d4  - d7},   [r1]!
-        pld         [r1, #(PREFETCH_DISTANCE)]
-        subs        r2, r2, #64
-        vst1.8      {d0  - d3},   [r0, :128]!
-        vst1.8      {d4  - d7},   [r0, :128]!
-        bhs         1b
-
-2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        add         r2, r2, #64
-        subs        r2, r2, #32
-        blo         4f
-
-3:      /* 32 bytes at a time. These cache lines were already preloaded */
-        vld1.8      {d0 - d3},  [r1]!
-        subs        r2, r2, #32
-        vst1.8      {d0 - d3},  [r0, :128]!
-        bhs         3b
-
-4:      /* less than 32 left */
-        add         r2, r2, #32
-        tst         r2, #0x10
-        beq         5f
-        // copies 16 bytes, 128-bits aligned
-        vld1.8      {d0, d1}, [r1]!
-        vst1.8      {d0, d1}, [r0, :128]!
-
-5:      /* copy up to 15-bytes (count in r2) */
-        movs        ip, r2, lsl #29
-        bcc         1f
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0]!
-1:      bge         2f
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2:      movs        ip, r2, lsl #31
-        ldrmib      r3, [r1], #1
-        ldrcsb      ip, [r1], #1
-        ldrcsb      lr, [r1], #1
-        strmib      r3, [r0], #1
-        strcsb      ip, [r0], #1
-        strcsb      lr, [r0], #1
-
-        ldmfd       sp!, {r0, lr}
-        bx          lr
-#ifndef __APPLE__
-        .fnend
-#endif
-
-#else   /* __ARM_ARCH__ < 7 */
-
-
-	.text
-
-#ifndef __APPLE__
-    .global fast_memcpy
-    .type fast_memcpy, %function
-#else
-    .globl _fast_memcpy
-#endif
-    .align 4
-
-		/*
-		 * Optimized memcpy() for ARM.
-         *
-		 * note that memcpy() always returns the destination pointer,
-		 * so we have to preserve R0.
-		 */
-
-#ifndef __APPLE__
-fast_memcpy:
-#else
-_fast_memcpy:
-#endif
-		/* The stack must always be 64-bits aligned to be compliant with the
-		 * ARM ABI. Since we have to save R0, we might as well save R4
-		 * which we can use for better pipelining of the reads below
-		 */
-#ifndef __APPLE__
-        .fnstart
-        .save       {r0, r4, lr}
-#endif
-        stmfd       sp!, {r0, r4, lr}
-        /* Making room for r5-r11 which will be spilled later */
-        .pad        #28
-        sub         sp, sp, #28
-
-        // preload the destination because we'll align it to a cache line
-        // with small writes. Also start the source "pump".
-        //PLD         (r0, #0)
-        //PLD         (r1, #0)
-        //PLD         (r1, #32)
-
-		/* it simplifies things to take care of len<4 early */
-		cmp			r2, #4
-		blo			copy_last_3_and_return
-
-		/* compute the offset to align the source
-		 * offset = (4-(src&3))&3 = -src & 3
-		 */
-		rsb			r3, r1, #0
-		ands		r3, r3, #3
-		beq			src_aligned
-
-		/* align source to 32 bits. We need to insert 2 instructions between
-		 * a ldr[b|h] and str[b|h] because byte and half-word instructions
-		 * stall 2 cycles.
-		 */
-		movs		r12, r3, lsl #31
-		sub			r2, r2, r3		/* we know that r3 <= r2 because r2 >= 4 */
-		ldrmib		r3, [r1], #1
-		ldrcsb		r4, [r1], #1
-		ldrcsb		r12,[r1], #1
-        strmib		r3, [r0], #1
-		strcsb		r4, [r0], #1
-		strcsb		r12,[r0], #1
-
-src_aligned:
-
-		/* see if src and dst are aligned together (congruent) */
-		eor			r12, r0, r1
-		tst			r12, #3
-		bne			non_congruent
-
-        /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
-         * frame. Don't update sp.
-         */
-        stmea		sp, {r5-r11}
-
-		/* align the destination to a cache-line */
-		rsb         r3, r0, #0
-		ands		r3, r3, #0x1C
-		beq         congruent_aligned32
-		cmp         r3, r2
-		andhi		r3, r2, #0x1C
-
-		/* conditionnaly copies 0 to 7 words (length in r3) */
-		movs		r12, r3, lsl #28
-		ldmcsia		r1!, {r4, r5, r6, r7}	/* 16 bytes */
-		ldmmiia		r1!, {r8, r9}			/*  8 bytes */
-		stmcsia		r0!, {r4, r5, r6, r7}
-		stmmiia		r0!, {r8, r9}
-		tst         r3, #0x4
-		ldrne		r10,[r1], #4			/*  4 bytes */
-		strne		r10,[r0], #4
-		sub         r2, r2, r3
-
-congruent_aligned32:
-		/*
-		 * here source is aligned to 32 bytes.
-		 */
-
-cached_aligned32:
-        subs        r2, r2, #32
-        blo         less_than_32_left
-
-        /*
-         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
-         * stall only until the requested world is fetched, but the linefill
-         * continues in the the background.
-         * While the linefill is going, we write our previous cache-line
-         * into the write-buffer (which should have some free space).
-         * When the linefill is done, the writebuffer will
-         * start dumping its content into memory
-         *
-         * While all this is going, we then load a full cache line into
-         * 8 registers, this cache line should be in the cache by now
-         * (or partly in the cache).
-         *
-         * This code should work well regardless of the source/dest alignment.
-         *
-         */
-
-        // Align the preload register to a cache-line because the cpu does
-        // "critical word first" (the first word requested is loaded first).
-        bic         r12, r1, #0x1F
-        add         r12, r12, #64
-
-1:      ldmia       r1!, { r4-r11 }
-        //PLD         (r12, #64)
-        subs        r2, r2, #32
-
-        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
-        // for ARM9 preload will not be safely guarded by the preceding subs.
-        // When it is safely guarded the only possibility to have SIGSEGV here
-        // is because the caller overstates the length.
-        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
-        stmia       r0!, { r4-r11 }
-		bhs         1b
-
-        add         r2, r2, #32
-
-
-
-
-less_than_32_left:
-		/*
-		 * less than 32 bytes left at this point (length in r2)
-		 */
-
-		/* skip all this if there is nothing to do, which should
-		 * be a common case (if not executed the code below takes
-		 * about 16 cycles)
-		 */
-		tst			r2, #0x1F
-		beq			1f
-
-		/* conditionnaly copies 0 to 31 bytes */
-		movs		r12, r2, lsl #28
-		ldmcsia		r1!, {r4, r5, r6, r7}	/* 16 bytes */
-		ldmmiia		r1!, {r8, r9}			/*  8 bytes */
-		stmcsia		r0!, {r4, r5, r6, r7}
-		stmmiia		r0!, {r8, r9}
-		movs		r12, r2, lsl #30
-		ldrcs		r3, [r1], #4			/*  4 bytes */
-		ldrmih		r4, [r1], #2			/*  2 bytes */
-		strcs		r3, [r0], #4
-		strmih		r4, [r0], #2
-		tst         r2, #0x1
-		ldrneb		r3, [r1]				/*  last byte  */
-		strneb		r3, [r0]
-
-		/* we're done! restore everything and return */
-1:		ldmfd		sp!, {r5-r11}
-		ldmfd		sp!, {r0, r4, lr}
-		bx			lr
-
-		/********************************************************************/
-
-non_congruent:
-		/*
-		 * here source is aligned to 4 bytes
-		 * but destination is not.
-		 *
-		 * in the code below r2 is the number of bytes read
-		 * (the number of bytes written is always smaller, because we have
-		 * partial words in the shift queue)
-		 */
-		cmp			r2, #4
-		blo			copy_last_3_and_return
-
-        /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
-         * frame. Don't update sp.
-         */
-        stmea		sp, {r5-r11}
-
-		/* compute shifts needed to align src to dest */
-		rsb			r5, r0, #0
-		and			r5, r5, #3			/* r5 = # bytes in partial words */
-		mov			r12, r5, lsl #3		/* r12 = right */
-		rsb			lr, r12, #32		/* lr = left  */
-
-		/* read the first word */
-		ldr			r3, [r1], #4
-		sub			r2, r2, #4
-
-		/* write a partial word (0 to 3 bytes), such that destination
-		 * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
-		 */
-		movs		r5, r5, lsl #31
-		strmib		r3, [r0], #1
-		movmi		r3, r3, lsr #8
-		strcsb		r3, [r0], #1
-		movcs		r3, r3, lsr #8
-		strcsb		r3, [r0], #1
-		movcs		r3, r3, lsr #8
-
-		cmp			r2, #4
-		blo			partial_word_tail
-
-		/* Align destination to 32 bytes (cache line boundary) */
-1:		tst			r0, #0x1c
-		beq			2f
-		ldr			r5, [r1], #4
-		sub         r2, r2, #4
-		orr			r4, r3, r5,		lsl lr
-		mov			r3, r5,			lsr r12
-		str			r4, [r0], #4
-        cmp         r2, #4
-		bhs			1b
-		blo			partial_word_tail
-
-		/* copy 32 bytes at a time */
-2:		subs		r2, r2, #32
-		blo			less_than_thirtytwo
-
-		/* Use immediate mode for the shifts, because there is an extra cycle
-		 * for register shifts, which could account for up to 50% of
-		 * performance hit.
-		 */
-
-        cmp			r12, #24
-		beq			loop24
-		cmp			r12, #8
-		beq			loop8
-
-loop16:
-        ldr         r12, [r1], #4
-1:      mov         r4, r12
-		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        //PLD         (r1, #64)
-        subs        r2, r2, #32
-        ldrhs       r12, [r1], #4
-		orr			r3, r3, r4,		lsl #16
-		mov			r4, r4,			lsr #16
-		orr			r4, r4, r5,		lsl #16
-		mov			r5, r5,			lsr #16
-		orr			r5, r5, r6,		lsl #16
-		mov			r6, r6,			lsr #16
-		orr			r6, r6, r7,		lsl #16
-		mov			r7, r7,			lsr #16
-		orr			r7, r7, r8,		lsl #16
-		mov			r8, r8,			lsr #16
-		orr			r8, r8, r9,		lsl #16
-		mov			r9, r9,			lsr #16
-		orr			r9, r9, r10,	lsl #16
-		mov			r10, r10,		lsr #16
-		orr			r10, r10, r11,	lsl #16
-		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
-		mov			r3, r11,		lsr #16
-		bhs			1b
-		b			less_than_thirtytwo
-
-loop8:
-        ldr         r12, [r1], #4
-1:      mov         r4, r12
-		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        //PLD         (r1, #64)
-		subs		r2, r2, #32
-        ldrhs       r12, [r1], #4
-		orr			r3, r3, r4,		lsl #24
-		mov			r4, r4,			lsr #8
-		orr			r4, r4, r5,		lsl #24
-		mov			r5, r5,			lsr #8
-		orr			r5, r5, r6,		lsl #24
-		mov			r6, r6,			lsr #8
-		orr			r6, r6, r7,		lsl #24
-		mov			r7, r7,			lsr #8
-		orr			r7, r7, r8,		lsl #24
-		mov			r8, r8,			lsr #8
-		orr			r8, r8, r9,		lsl #24
-		mov			r9, r9,			lsr #8
-		orr			r9, r9, r10,	lsl #24
-		mov			r10, r10,		lsr #8
-		orr			r10, r10, r11,	lsl #24
-		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
-		mov			r3, r11,		lsr #8
-		bhs			1b
-		b			less_than_thirtytwo
-
-loop24:
-        ldr         r12, [r1], #4
-1:      mov         r4, r12
-		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        //PLD         (r1, #64)
-		subs		r2, r2, #32
-        ldrhs       r12, [r1], #4
-		orr			r3, r3, r4,		lsl #8
-		mov			r4, r4,			lsr #24
-		orr			r4, r4, r5,		lsl #8
-		mov			r5, r5,			lsr #24
-		orr			r5, r5, r6,		lsl #8
-		mov			r6, r6,			lsr #24
-		orr			r6, r6, r7,		lsl #8
-		mov			r7, r7,			lsr #24
-		orr			r7, r7, r8,		lsl #8
-		mov			r8, r8,			lsr #24
-		orr			r8, r8, r9,		lsl #8
-		mov			r9, r9,			lsr #24
-		orr			r9, r9, r10,	lsl #8
-		mov			r10, r10,		lsr #24
-		orr			r10, r10, r11,	lsl #8
-		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
-		mov			r3, r11,		lsr #24
-		bhs			1b
-
-
-less_than_thirtytwo:
-		/* copy the last 0 to 31 bytes of the source */
-		rsb			r12, lr, #32		/* we corrupted r12, recompute it  */
-		add			r2, r2, #32
-		cmp			r2, #4
-		blo			partial_word_tail
-
-1:		ldr			r5, [r1], #4
-		sub         r2, r2, #4
-		orr			r4, r3, r5,		lsl lr
-		mov			r3,	r5,			lsr r12
-		str			r4, [r0], #4
-        cmp         r2, #4
-		bhs			1b
-
-partial_word_tail:
-		/* we have a partial word in the input buffer */
-		movs		r5, lr, lsl #(31-3)
-		strmib		r3, [r0], #1
-		movmi		r3, r3, lsr #8
-		strcsb		r3, [r0], #1
-		movcs		r3, r3, lsr #8
-		strcsb		r3, [r0], #1
-
-		/* Refill spilled registers from the stack. Don't update sp. */
-		ldmfd		sp, {r5-r11}
-
-copy_last_3_and_return:
-		movs		r2, r2, lsl #31	/* copy remaining 0, 1, 2 or 3 bytes */
-		ldrmib		r2, [r1], #1
-		ldrcsb		r3, [r1], #1
-		ldrcsb		r12,[r1]
-		strmib		r2, [r0], #1
-		strcsb		r3, [r0], #1
-		strcsb		r12,[r0]
-
-        /* we're done! restore sp and spilled registers and return */
-        add         sp,  sp, #28
-		ldmfd		sp!, {r0, r4, lr}
-		bx			lr
-#ifndef __APPLE__
-        .fnend
-#endif
-
-#endif    /* __ARM_ARCH__ < 7 */
-#endif
-
-#if defined(__linux__) && defined(__ELF__)
-/* we don't need an executable stack */
-.section .note.GNU-stack,"",%progbits
-#endif
diff --git a/xbmc/utils/fastmemcpy.c b/xbmc/utils/fastmemcpy.c
deleted file mode 100644
index ec9019a..0000000
--- a/xbmc/utils/fastmemcpy.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/*
- * fastmemcpy.h : fast memcpy routines
- *****************************************************************************
- *      $Id: fastmemcpy.h 13905 2006-01-12 23:10:04Z dionoea $
- *
- *      Authors: various Linux kernel hackers
- *               various MPlayer hackers
- *               Nick Kurshev <nickols_k@mail.ru>
- *
- *      Copyright (C) 2011-2013 Team XBMC
- *      http://xbmc.org
- *
- *  This Program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This Program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with XBMC; see the file COPYING.  If not, see
- *  <http://www.gnu.org/licenses/>.
- *
- */
-#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__mips__)
-#define HAVE_MMX2
-#define HAVE_SSE
-
-/*
-  aclib - advanced C library ;)
-  This file contains functions which improve and expand standard C-library
-*/
-#include <stddef.h>
-
-#define BLOCK_SIZE 4096
-#define CONFUSION_FACTOR 0
-/*Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)*/
-
-/*#define STATISTICS*/
-
-#ifndef HAVE_SSE2
-/*
-   P3 processor has only one SSE decoder so can execute only 1 sse insn per
-   cpu clock, but it has 3 mmx decoders (include load/store unit)
-   and executes 3 mmx insns per cpu clock.
-   P4 processor has some chances, but after reading:
-   http://www.emulators.com/pentium4.htm
-   I have doubts. Anyway SSE2 version of this code can be written better.
-*/
-#undef HAVE_SSE
-#endif
-
-
-/*
- This part of code was taken by me from Linux-2.4.3 and slightly modified
-for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
-blocks but mplayer uses weakly ordered data and original sources can not
-speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
-
->From IA-32 Intel Architecture Software Developer's Manual Volume 1,
-
-Order Number 245470:
-"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
-
-Data referenced by a program can be temporal (data will be used again) or
-non-temporal (data will be referenced once and not reused in the immediate
-future). To make efficient use of the processor's caches, it is generally
-desirable to cache temporal data and not cache non-temporal data. Overloading
-the processor's caches with non-temporal data is sometimes referred to as
-"polluting the caches".
-The non-temporal data is written to memory with Write-Combining semantics.
-
-The PREFETCHh instructions permits a program to load data into the processor
-at a suggested cache level, so that it is closer to the processors load and
-store unit when it is needed. If the data is already present in a level of
-the cache hierarchy that is closer to the processor, the PREFETCHh instruction
-will not result in any data movement.
-But we should you PREFETCHNTA: Non-temporal data fetch data into location
-close to the processor, minimizing cache pollution.
-
-The MOVNTQ (store quadword using non-temporal hint) instruction stores
-packed integer data from an MMX register to memory, using a non-temporal hint.
-The MOVNTPS (store packed single-precision floating-point values using
-non-temporal hint) instruction stores packed floating-point data from an
-XMM register to memory, using a non-temporal hint.
-
-The SFENCE (Store Fence) instruction controls write ordering by creating a
-fence for memory store operations. This instruction guarantees that the results
-of every store instruction that precedes the store fence in program order is
-globally visible before any store instruction that follows the fence. The
-SFENCE instruction provides an efficient way of ensuring ordering between
-procedures that produce weakly-ordered data and procedures that consume that
-data.
-
-If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
-*/
-
-/* 3dnow memcpy support from kernel 2.4.2 */
-/*  by Pontscho/fresh!mindworkz           */
-
-#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )
-
-#undef HAVE_MMX1
-#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
-/*  means: mmx v.1. Note: Since we added alignment of destinition it speedups
-    of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus
-    standard (non MMX-optimized) version.
-    Note: on K6-2+ it speedups memory copying upto 25% and
-          on K7 and P3 about 500% (5 times). */
-#define HAVE_MMX1
-#endif
-
-
-#undef HAVE_K6_2PLUS
-#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
-#define HAVE_K6_2PLUS
-#endif
-
-/* for small memory blocks (<256 bytes) this version is faster */
-#define small_memcpy(to,from,n)\
-{\
-register unsigned long int dummy;\
-__asm__ __volatile__(\
-	"rep; movsb"\
-	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
-/* It's most portable way to notify compiler */\
-/* that edi, esi and ecx are clobbered in asm block. */\
-/* Thanks to A'rpi for hint!!! */\
-        :"0" (to), "1" (from),"2" (n)\
-	: "memory");\
-}
-
-#ifdef HAVE_SSE
-#define MMREG_SIZE 16
-#else
-#define MMREG_SIZE 64 /*8*/
-#endif
-
-/* Small defines (for readability only) ;) */
-#ifdef HAVE_K6_2PLUS
-#define PREFETCH "prefetch"
-/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
-#define EMMS     "femms"
-#else
-#define PREFETCH "prefetchnta"
-#define EMMS     "emms"
-#endif
-
-#ifdef HAVE_MMX2
-#define MOVNTQ "movntq"
-#else
-#define MOVNTQ "movq"
-#endif
-
-#ifdef HAVE_MMX1
-#define MIN_LEN 0x800  /* 2K blocks */
-#else
-#define MIN_LEN 0x40  /* 64-byte blocks */
-#endif
-
-void * fast_memcpy(void * to, const void * from, size_t len)
-{
-	void *retval;
-	size_t i;
-	retval = to;
-#ifdef STATISTICS
-	{
-		static int freq[33];
-		static int t=0;
-		int i;
-		for(i=0; len>(1<<i); i++);
-		freq[i]++;
-		t++;
-		if(1024*1024*1024 % t == 0)
-			for(i=0; i<32; i++)
-				printf("freq < %8d %4d\n", 1<<i, freq[i]);
-	}
-#endif
-#ifndef HAVE_MMX1
-        /* PREFETCH has effect even for MOVSB instruction ;) */
-	__asm__ __volatile__ (
-	        PREFETCH" (%0)\n"
-	        PREFETCH" 64(%0)\n"
-	        PREFETCH" 128(%0)\n"
-        	PREFETCH" 192(%0)\n"
-        	PREFETCH" 256(%0)\n"
-		: : "r" (from) );
-#endif
-        if(len >= MIN_LEN)
-	{
-	  register unsigned long int delta;
-          /* Align destinition to MMREG_SIZE -boundary */
-          delta = ((unsigned long int)to)&(MMREG_SIZE-1);
-          if(delta)
-	  {
-	    delta=MMREG_SIZE-delta;
-	    len -= delta;
-	    small_memcpy(to, from, delta);
-	  }
-	  i = len >> 6; /* len/64 */
-	  len&=63;
-        /*
-           This algorithm is top effective when the code consequently
-           reads and writes blocks which have size of cache line.
-           Size of cache line is processor-dependent.
-           It will, however, be a minimum of 32 bytes on any processors.
-           It would be better to have a number of instructions which
-           perform reading and writing to be multiple to a number of
-           processor's decoders, but it's not always possible.
-        */
-#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
-	if(((unsigned long)from) & 15)
-	/* if SRC is misaligned */
-	for(; i>0; i--)
-	{
-		__asm__ __volatile__ (
-		PREFETCH" 320(%0)\n"
-		"movups (%0), %%xmm0\n"
-		"movups 16(%0), %%xmm1\n"
-		"movups 32(%0), %%xmm2\n"
-		"movups 48(%0), %%xmm3\n"
-		"movntps %%xmm0, (%1)\n"
-		"movntps %%xmm1, 16(%1)\n"
-		"movntps %%xmm2, 32(%1)\n"
-		"movntps %%xmm3, 48(%1)\n"
-		:: "r" (from), "r" (to) : "memory");
-		((const unsigned char *)from)+=64;
-		((unsigned char *)to)+=64;
-	}
-	else
-	/*
-	   Only if SRC is aligned on 16-byte boundary.
-	   It allows to use movaps instead of movups, which required data
-	   to be aligned or a general-protection exception (#GP) is generated.
-	*/
-	for(; i>0; i--)
-	{
-		__asm__ __volatile__ (
-		PREFETCH" 320(%0)\n"
-		"movaps (%0), %%xmm0\n"
-		"movaps 16(%0), %%xmm1\n"
-		"movaps 32(%0), %%xmm2\n"
-		"movaps 48(%0), %%xmm3\n"
-		"movntps %%xmm0, (%1)\n"
-		"movntps %%xmm1, 16(%1)\n"
-		"movntps %%xmm2, 32(%1)\n"
-		"movntps %%xmm3, 48(%1)\n"
-		:: "r" (from), "r" (to) : "memory");
-		((const unsigned char *)from)+=64;
-		((unsigned char *)to)+=64;
-	}
-#else
-	/* Align destination at BLOCK_SIZE boundary */
-	for(; ((ptrdiff_t)to & (BLOCK_SIZE-1)) && i>0; i--)
-	{
-		__asm__ __volatile__ (
-#ifndef HAVE_MMX1
-        	PREFETCH" 320(%0)\n"
-#endif
-		"movq (%0), %%mm0\n"
-		"movq 8(%0), %%mm1\n"
-		"movq 16(%0), %%mm2\n"
-		"movq 24(%0), %%mm3\n"
-		"movq 32(%0), %%mm4\n"
-		"movq 40(%0), %%mm5\n"
-		"movq 48(%0), %%mm6\n"
-		"movq 56(%0), %%mm7\n"
-		MOVNTQ" %%mm0, (%1)\n"
-		MOVNTQ" %%mm1, 8(%1)\n"
-		MOVNTQ" %%mm2, 16(%1)\n"
-		MOVNTQ" %%mm3, 24(%1)\n"
-		MOVNTQ" %%mm4, 32(%1)\n"
-		MOVNTQ" %%mm5, 40(%1)\n"
-		MOVNTQ" %%mm6, 48(%1)\n"
-		MOVNTQ" %%mm7, 56(%1)\n"
-		:: "r" (from), "r" (to) : "memory");
-                from = (const void *) (((const unsigned char *)from)+64);
-		to = (void *) (((unsigned char *)to)+64);
-	}
-
-/*	printf(" %p %p\n", (ptrdiff_t)from&1023, (ptrdiff_t)to&1023); */
-	/* Pure Assembly cuz gcc is a bit unpredictable ;) */
-# if 0
-	if(i>=BLOCK_SIZE/64)
-		asm volatile(
-			"xorl %%eax, %%eax	\n\t"
-			".balign 16		\n\t"
-			"1:			\n\t"
-				"movl (%0, %%eax), %%ebx 	\n\t"
-				"movl 32(%0, %%eax), %%ebx 	\n\t"
-				"movl 64(%0, %%eax), %%ebx 	\n\t"
-				"movl 96(%0, %%eax), %%ebx 	\n\t"
-				"addl $128, %%eax		\n\t"
-				"cmpl %3, %%eax			\n\t"
-				" jb 1b				\n\t"
-
-			"xorl %%eax, %%eax	\n\t"
-
-				".balign 16		\n\t"
-				"2:			\n\t"
-				"movq (%0, %%eax), %%mm0\n"
-				"movq 8(%0, %%eax), %%mm1\n"
-				"movq 16(%0, %%eax), %%mm2\n"
-				"movq 24(%0, %%eax), %%mm3\n"
-				"movq 32(%0, %%eax), %%mm4\n"
-				"movq 40(%0, %%eax), %%mm5\n"
-				"movq 48(%0, %%eax), %%mm6\n"
-				"movq 56(%0, %%eax), %%mm7\n"
-				MOVNTQ" %%mm0, (%1, %%eax)\n"
-				MOVNTQ" %%mm1, 8(%1, %%eax)\n"
-				MOVNTQ" %%mm2, 16(%1, %%eax)\n"
-				MOVNTQ" %%mm3, 24(%1, %%eax)\n"
-				MOVNTQ" %%mm4, 32(%1, %%eax)\n"
-				MOVNTQ" %%mm5, 40(%1, %%eax)\n"
-				MOVNTQ" %%mm6, 48(%1, %%eax)\n"
-				MOVNTQ" %%mm7, 56(%1, %%eax)\n"
-				"addl $64, %%eax		\n\t"
-				"cmpl %3, %%eax		\n\t"
-				"jb 2b				\n\t"
-
-#if CONFUSION_FACTOR > 0
-	/* a few percent speedup on out of order executing CPUs */
-			"movl %5, %%eax		\n\t"
-				"2:			\n\t"
-				"movl (%0), %%ebx	\n\t"
-				"movl (%0), %%ebx	\n\t"
-				"movl (%0), %%ebx	\n\t"
-				"movl (%0), %%ebx	\n\t"
-				"decl %%eax		\n\t"
-				" jnz 2b		\n\t"
-#endif
-
-			"xorl %%eax, %%eax	\n\t"
-			"addl %3, %0		\n\t"
-			"addl %3, %1		\n\t"
-			"subl %4, %2		\n\t"
-			"cmpl %4, %2		\n\t"
-			" jae 1b		\n\t"
-				: "+r" (from), "+r" (to), "+r" (i)
-				: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
-				: "%eax", "%ebx"
-		);
-#endif
-
-	for(; i>0; i--)
-	{
-		__asm__ __volatile__ (
-#ifndef HAVE_MMX1
-        	PREFETCH" 320(%0)\n"
-#endif
-		"movq (%0), %%mm0\n"
-		"movq 8(%0), %%mm1\n"
-		"movq 16(%0), %%mm2\n"
-		"movq 24(%0), %%mm3\n"
-		"movq 32(%0), %%mm4\n"
-		"movq 40(%0), %%mm5\n"
-		"movq 48(%0), %%mm6\n"
-		"movq 56(%0), %%mm7\n"
-		MOVNTQ" %%mm0, (%1)\n"
-		MOVNTQ" %%mm1, 8(%1)\n"
-		MOVNTQ" %%mm2, 16(%1)\n"
-		MOVNTQ" %%mm3, 24(%1)\n"
-		MOVNTQ" %%mm4, 32(%1)\n"
-		MOVNTQ" %%mm5, 40(%1)\n"
-		MOVNTQ" %%mm6, 48(%1)\n"
-		MOVNTQ" %%mm7, 56(%1)\n"
-		:: "r" (from), "r" (to) : "memory");
-		from = (const void *) (((const unsigned char *)from)+64);
-		to = (void *) (((unsigned char *)to)+64);
-	}
-
-#endif /* Have SSE */
-#ifdef HAVE_MMX2
-                /* since movntq is weakly-ordered, a "sfence"
-		 * is needed to become ordered again. */
-		__asm__ __volatile__ ("sfence":::"memory");
-#endif
-#ifndef HAVE_SSE
-		/* enables to use FPU */
-		__asm__ __volatile__ (EMMS:::"memory");
-#endif
-	}
-	/*
-	 *	Now do the tail of the block
-	 */
-	if(len) small_memcpy(to, from, len);
-	return retval;
-}
-
-
-#endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */
-
-#endif
diff --git a/xbmc/utils/fastmemcpy.h b/xbmc/utils/fastmemcpy.h
deleted file mode 100644
index 43f5904..0000000
--- a/xbmc/utils/fastmemcpy.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- *      Copyright (C) 2005-2013 Team XBMC
- *      http://xbmc.org
- *
- *  This Program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This Program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with XBMC; see the file COPYING.  If not, see
- *  <http://www.gnu.org/licenses/>.
- *
- */
-#pragma once
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS)
-void * fast_memcpy(void * to, const void * from, size_t len);
-//#define fast_memcpy memcpy
-#else
-#define fast_memcpy memcpy
-#endif
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/xbmc/utils/test/Makefile b/xbmc/utils/test/Makefile
index 8fa0526..3a467ad 100644
--- a/xbmc/utils/test/Makefile
+++ b/xbmc/utils/test/Makefile
@@ -11,7 +11,6 @@ SRCS=	\
 	TestCryptThreading.cpp \
 	TestDatabaseUtils.cpp \
 	TestEndianSwap.cpp \
-	Testfastmemcpy.cpp \
 	TestFileOperationJob.cpp \
 	TestFileUtils.cpp \
 	Testfstrcmp.cpp \
diff --git a/xbmc/utils/test/Testfastmemcpy.cpp b/xbmc/utils/test/Testfastmemcpy.cpp
deleted file mode 100644
index 93a9bb0..0000000
--- a/xbmc/utils/test/Testfastmemcpy.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- *      Copyright (C) 2005-2013 Team XBMC
- *      http://xbmc.org
- *
- *  This Program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This Program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with XBMC; see the file COPYING.  If not, see
- *  <http://www.gnu.org/licenses/>.
- *
- */
-
-#include <stddef.h> // TODO: This should go in fastmemcpy.h instead.
-#include "utils/fastmemcpy.h"
-
-#include "gtest/gtest.h"
-
-static const char refdata[] = "\x01\x02\x03\x04\x05\x06\x07\x08"
-                              "\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10"
-                              "\x11\x12\x13\x14\x15\x16\x17\x18"
-                              "\x19\x1a\x1b\x1c\x1d\x1e\x1f\x20"
-                              "\x21\x22\x23\x24\x25\x26\x27\x28"
-                              "\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30";
-
-TEST(Testfastmemcpy, General)
-{
-  char vardata[sizeof(refdata)];
-  memset(vardata, 0, sizeof(vardata));
-  EXPECT_NE(nullptr, fast_memcpy(vardata, refdata, sizeof(refdata)));
-  EXPECT_EQ(0, memcmp(refdata, vardata, sizeof(refdata)));
-}