Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge from upstream #67

Merged
merged 24 commits into from
Jul 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7b9d891
Fix integral type dispatch error message (#9625)
vishwakftw Jul 20, 2018
f84fdc7
Remove unnecessary functions from StorageDerived.h
cpuhrsch Jul 20, 2018
bbb30ad
Use THTensor/Storage for THVoidTensor/Storage (#9588)
cpuhrsch Jul 20, 2018
b9e89cf
Revert "Extend DispatchStub to support CUDA dispatch (#9579)" (#9614)
colesbury Jul 20, 2018
1d4d9fc
Prepare to stop using attributes in the JIT (#9505)
apaszke Jul 20, 2018
a3a6ab6
Fix the error in UnpackSegmentsOp when calculating the gradient with …
nateanl Jul 20, 2018
d368886
Fixed a missing '=' in LPPoolNd repr function (#9629)
vmirly Jul 20, 2018
bae156a
Support (some) CUDA Lapack on n-dimensional empty tensors.
gchanan Jul 20, 2018
8a0fe0a
set_input_record() should always add external input (#9636)
kittipatv Jul 20, 2018
1003ccf
Creates CUDAContext (#9435)
mruberry Jul 20, 2018
0158103
Add workspace.RunPlanInBackground (#9637)
kittipatv Jul 20, 2018
45e5c17
ONNXIFI transform (#9569)
Jul 20, 2018
3efdece
Support n-dimensional empty tensors in take/put.
gchanan Jul 20, 2018
5e84403
Fix for half conversion for ROCm 1.8.2 (#9663)
Jul 21, 2018
23ed26a
Guard include of cuda-only header comm.h (#9656)
ezyang Jul 21, 2018
b5c8d59
Add a CUDAContext header include
ezyang Jul 21, 2018
3bb8c5e
Allow MKLDNN on macOS, and any other OS where CMake is able to detect…
Jul 21, 2018
a01d6f0
Update channel_shuffle_op and transpose 2d to speed up ShuffleNet (#9…
xiaomengy Jul 21, 2018
769cb5a
Add new ways of matching nodes with schemas in the JIT (#9567)
apaszke Jul 22, 2018
f3d72b2
Modify barrier net to allow better control over its initialization an…
ygdx Jul 22, 2018
1afdc57
Hide all other fields in THTensor (#9683)
ezyang Jul 22, 2018
9ee5133
Fix dataloader hang when it is not completely iterated (#9655)
ssnl Jul 23, 2018
53083b8
Remove CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS and fix CUDA 8 build on Windo…
peterjc123 Jul 23, 2018
8a3634d
Merge remote-tracking branch 'upstream/master'
iotamudelta Jul 23, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ torch/csrc/nn/THNN.cpp
torch/csrc/nn/THNN.cwrap
torch/lib/*.a*
torch/lib/*.dll*
torch/lib/*.exe*
torch/lib/*.dylib*
torch/lib/*.h
torch/lib/*.lib
Expand Down
9 changes: 6 additions & 3 deletions .jenkins/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,13 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
fi

export ATEN_DISABLE_AVX=
export ATEN_DISABLE_AVX2=
if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then
export ATEN_CPU_CAPABILITY=default
elif [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then
export ATEN_CPU_CAPABILITY=avx
export ATEN_DISABLE_AVX=1
fi
if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then
export ATEN_DISABLE_AVX2=1
fi

test_python_nn() {
Expand Down
4 changes: 0 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,6 @@ endif()
# ---[ CMake scripts + modules
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

if (MSVC AND ${BUILD_SHARED_LIBS})
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()

# ---[ CMake build directories
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/Backtrace.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <ATen/optional.h>
#include <ATen/Backtrace.h>

#include <functional>
#include <memory>
Expand Down
6 changes: 4 additions & 2 deletions aten/src/ATen/Backtrace.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include <string>
#include <typeinfo>

#include <ATen/ATenGeneral.h>

namespace at {
/// Utility to demangle a C++ symbol name.
std::string demangle(const char* name);
AT_API std::string demangle(const char* name);

/// Returns the printable name of the type.
template <typename T>
Expand All @@ -19,7 +21,7 @@ inline const char* demangle_type() {
#endif // __GXX_RTTI
}

std::string get_backtrace(
AT_API std::string get_backtrace(
size_t frames_to_skip = 0,
size_t maximum_number_of_frames = 64,
bool skip_python_frames = true);
Expand Down
110 changes: 0 additions & 110 deletions aten/src/ATen/CUDAGuard.h
Original file line number Diff line number Diff line change
@@ -1,110 +0,0 @@
#pragma once

#include <ATen/ArrayRef.h>
#include <ATen/CUDAStream.h>
#include <ATen/Context.h>
#include <ATen/DeviceGuard.h>

#include <cstddef>
#include <vector>

namespace at {

/// A variant of `DeviceGuard` that augments it with an understanding of CUDA
/// streams. This guard can not only set and reset the current CUDA device, but
/// also set and reset the current CUDA stream. It is important to note that
/// because a CUDA stream is intrinsically associated with the CUDA device to
/// which it is bound, setting the CUDA stream *also* sets the current CUDA
/// device to that of the stream.
struct CUDAGuard {
  /// Default constructor, does nothing and causes no change in the current
  /// stream or device until `set_stream` or `set_device` is called.
  CUDAGuard() = default;

  /// Sets the CUDA stream and its associated device as the current one (calls
  /// `set_stream`).
  explicit CUDAGuard(const CUDAStream& stream) {
    set_stream(stream);
  }

  /// Calls `set_device` with the given index.
  explicit CUDAGuard(int32_t device) {
    set_device(device);
  }

  CUDAGuard(const CUDAGuard&) = delete;
  CUDAGuard& operator=(const CUDAGuard&) = delete;

  /// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The
  /// moved-from `CUDAGuard` is modified such that its destruction has no
  /// effect (does not reset the stream or device).
  /// NOTE(review): the defaulted move relies on the moved-from std::vector
  /// being left empty, which holds on all major implementations but is not
  /// formally guaranteed by the standard.
  CUDAGuard(CUDAGuard&& other) noexcept = default;

  /// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The
  /// moved-from `CUDAGuard` is modified such that its destruction has no
  /// effect (does not reset the stream or device).
  CUDAGuard& operator=(CUDAGuard&& other) {
    // Guard against self-move-assignment: without this check we would
    // move-from and then clear() our own original_streams_, and the
    // destructor could no longer restore the saved streams.
    if (this != &other) {
      device_guard_ = std::move(other.device_guard_);
      original_streams_ = std::move(other.original_streams_);
      other.original_streams_.clear();
    }
    return *this;
  }

  /// Resets the CUDA stream on each device to the one that was active upon
  /// construction.
  ~CUDAGuard() {
    if (!original_streams_.empty()) {
      for (size_t device = 0; device < original_streams_.size(); ++device) {
        // Unchecked variant: destructors must not throw on CUDA errors.
        globalContext().uncheckedSetCurrentCUDAStreamOnDevice(
            device, original_streams_[device]);
      }
    }
  }

  /// Sets the current CUDA device to the device associated with the given
  /// stream, and then sets the current stream on that device to the one given.
  void set_stream(const CUDAStream& stream) {
    device_guard_.set_index(stream.device());
    // If we haven't stored the current stream yet, store it now — one saved
    // stream per device, so the destructor can restore all of them.
    if (original_streams_.empty()) {
      const size_t device_count = globalContext().getNumGPUs();
      original_streams_.reserve(device_count);
      for (size_t device = 0; device < device_count; ++device) {
        original_streams_.push_back(
            globalContext().getCurrentCUDAStreamOnDevice(device));
      }
    }
    globalContext().setCurrentCUDAStreamOnDevice(
        device_guard_.last_index(), stream);
  }

  /// Sets the CUDA device to the given one.
  void set_device(int32_t device) {
    device_guard_.set_index(device);
  }

  /// Returns the CUDA streams that were active in the first call to
  /// `set_stream`. If there was no such call, the returned container is
  /// empty.
  ArrayRef<CUDAStream> original_streams() const noexcept {
    return original_streams_;
  }

  /// Returns the device that was set upon construction of the guard.
  int32_t original_device() const noexcept {
    return device_guard_.original_index();
  }

  /// Returns the last device that was set via `set_device`, if any.
  int32_t last_device() const noexcept {
    return device_guard_.last_index();
  }

 private:
  /// The guard for the current device.
  DeviceGuard device_guard_;
  /// The original streams that were active on all devices.
  std::vector<CUDAStream> original_streams_;
};

} // namespace at
197 changes: 0 additions & 197 deletions aten/src/ATen/CUDAStream.cpp
Original file line number Diff line number Diff line change
@@ -1,197 +0,0 @@
#include "ATen/CUDAStream.h"
#include "ATen/Error.h"
#include "ATen/detail/CUDAHooksInterface.h"

#include <mutex>

// Internal implementation is entirely hidden
// Refcounted stream record. The default stream for each device is marked
// is_destructible == false and is never destroyed or refcounted.
struct CUDAStreamInternals {
  bool is_destructible;
  std::atomic<int> refcount;
  int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t
  cudaStream_t stream;
};

namespace at {

namespace detail {

/*
 * Stream state
 *
 * One indestructible default stream is kept per device (created lazily,
 * once, globally); the "current" stream pointer per device is thread local.
 */
static constexpr cudaStream_t DEFAULT_STREAM = 0;

static std::once_flag init_flag;
static int64_t num_gpus;
static CUDAStreamInternals* default_streams;
static thread_local CUDAStreamInternals** current_streams = nullptr;

// Creates a(n indestructible) default stream for each device
// Note: the default stream on each device is signified by a zero
// value for the pointer, and so is not actually created as usual.
// In particular, we don't need to switch devices when creating the
// streams.
static void initDefaultCUDAStreams() {
  num_gpus = getCUDAHooks().getNumGPUs();
  default_streams = (CUDAStreamInternals*) malloc(num_gpus * sizeof(CUDAStreamInternals));
  for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) {
    default_streams[i].is_destructible = false;
    default_streams[i].refcount = 0;
    default_streams[i].device = i;
    default_streams[i].stream = DEFAULT_STREAM;
  }
}

// Init front-end to ensure initialization only occurs once
static void initCUDAStreamsOnce() {
  // Inits default streams (once, globally)
  std::call_once(init_flag, initDefaultCUDAStreams);

  // Inits current streams (thread local) to default streams
  if (current_streams) return;
  current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*));
  for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) {
    current_streams[i] = &default_streams[i];
  }
}

/*
 * Pointer-based stream API
 */

// Helper to return the current device
static inline int64_t current_device() {
  int cur_device;
  DynamicCUDAInterface::get_device(&cur_device);
  return cur_device;
}

// Helper to verify the GPU index is valid
static inline void check_gpu(int64_t device) {
  AT_ASSERT(device >= 0 && device < num_gpus);
}

// Returns the (non-refcounted) default stream record for the given device.
CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) {
  initCUDAStreamsOnce();
  check_gpu(device);
  return &default_streams[device];
}
CUDAStreamInternals* CUDAStream_getDefaultStream() {
  return CUDAStream_getDefaultStreamOnDevice(current_device());
}

// Creates (and retains) a new cuda stream on the current device.
CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority) {
  CUDAStreamInternals* internals = (CUDAStreamInternals*) malloc(sizeof(CUDAStreamInternals));
  internals->is_destructible = true;
  internals->refcount = 1;
  internals->device = current_device();
  DynamicCUDAInterface::cuda_stream_create_with_priority(&internals->stream, flags, priority);
  return internals;
}

// Note: despite these methods not being named "unsafe," when using them in a
// multithreaded environment the caller must be sure that streams are valid
// when they're requested. These methods will throw an error if an
// invalid stream is requested.
CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) {
  initCUDAStreamsOnce();
  check_gpu(device);
  auto cur = current_streams[device];
  AT_ASSERT(CUDAStream_retain(cur));
  return cur;
}
CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() {
  return CUDAStream_getAndRetainCurrentStreamOnDevice(current_device());
}

// Note: these unsafe methods do not retain the stream before returning it.
// This is unsafe behavior and these methods SHOULD NOT BE USED.
// They are here only for legacy compatibility.
CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device) {
  initCUDAStreamsOnce();
  check_gpu(device);
  return current_streams[device];
}
CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe() {
  return CUDAStream_getCurrentStreamOnDeviceUnsafe(current_device());
}

// Makes ptr current on its device: retains ptr, then releases the
// previously-current stream.
void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) {
  initCUDAStreamsOnce();
  check_gpu(device);
  AT_ASSERT(ptr);
  AT_ASSERT(ptr->device == device);
  AT_ASSERT(CUDAStream_retain(ptr));

  CUDAStream_free(current_streams[device]);
  current_streams[device] = ptr;
}

// Unchecked variant: skips device validation, does not retain ptr, and frees
// the old stream without CUDA error checking (see CUDAStream_uncheckedFree).
void CUDAStream_uncheckedSetStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) {
  initCUDAStreamsOnce();
  CUDAStream_uncheckedFree(current_streams[device]);
  current_streams[device] = ptr;
}

void CUDAStream_setStream(CUDAStreamInternals* ptr) {
  CUDAStream_setStreamOnDevice(current_device(), ptr);
}

// Getters
cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) {
  AT_ASSERT(ptr);
  return ptr->stream;
}

int64_t CUDAStream_device(CUDAStreamInternals* ptr) {
  AT_ASSERT(ptr);
  return ptr->device;
}

// Memory management
// Note: only destructible (non-default) streams are ref counted
bool CUDAStream_retain(CUDAStreamInternals* ptr) {
  AT_ASSERT(ptr);
  // Returns true iff the record was already owned by someone else
  // (refcount was >= 1 before the increment); default streams always succeed.
  if (ptr->is_destructible) return(++ptr->refcount > 1);
  return true;
}

// Decrements the refcount and destroys the stream once it reaches zero.
// Default streams (stream == 0, is_destructible == false) are never freed.
void CUDAStream_free(CUDAStreamInternals*& ptr) {
  if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) {
    AT_ASSERT(ptr->refcount == 0);
    DynamicCUDAInterface::cuda_stream_destroy(ptr->stream);
    free(ptr);
    ptr = nullptr;
  }
}
// As CUDAStream_free, but destroys the stream via the unchecked interface
// (no CUDA error checking) and without asserting on the refcount.
void CUDAStream_uncheckedFree(CUDAStreamInternals*& ptr) {
  if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) {
    DynamicCUDAInterface::unchecked_cuda_stream_destroy(ptr->stream);
    free(ptr);
    ptr = nullptr;
  }
}

} // namespace detail

/*
* CUDAStream functions
*/

// Copy constructor
// Retains the source's internals (refcount bump for non-default streams)
// before sharing the pointer, so both objects co-own the stream record.
CUDAStream::CUDAStream(const CUDAStream& other) {
  AT_ASSERT(other.internals_);
  // The retain happens inside the assert's argument; AT_ASSERT in ATen is
  // always evaluated, so the side effect is not compiled out.
  AT_ASSERT(detail::CUDAStream_retain(other.internals_));

  internals_ = other.internals_;
}

// Move constructor
// No retain/free needed: ownership of the internals transfers via swap.
// NOTE(review): assumes internals_ has a member initializer (presumably
// nullptr) in CUDAStream.h, so the moved-from object ends up holding that
// value after the swap — confirm against the header.
CUDAStream::CUDAStream(CUDAStream&& other) {
  AT_ASSERT(other.internals_);

  std::swap(internals_, other.internals_);
}

} // namespace at
Loading