In [1]:
def svm_loss_naive(W, X, y, reg):
    """
    Structured (multiclass) SVM loss and gradient, naive implementation with
    explicit Python loops.

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    dW = np.zeros(W.shape)  # gradient accumulator, same shape as W
    num_classes = W.shape[1]
    num_train = X.shape[0]
    loss = 0.0

    # `range` (not `xrange`) so the function runs on both Python 2 and Python 3.
    for i in range(num_train):
        scores = X[i].dot(W)
        # y[i] is the true label of sample i; this is the score of that class.
        correct_class_score = scores[y[i]]
        for j in range(num_classes):
            if j == y[i]:
                # The sum in the loss runs over j != y[i] only, so skip this
                # iteration (the loop itself keeps going).
                continue
            margin = scores[j] - correct_class_score + 1  # note delta = 1
            if margin > 0:
                loss += margin
                # d(margin)/dW[:, j] = X[i] and d(margin)/dW[:, y[i]] = -X[i];
                # classes that do not violate the margin contribute nothing.
                dW[:, j] += X[i]
                dW[:, y[i]] -= X[i]

    # Up to here loss and dW are sums over all training examples; turn them
    # into averages by dividing by num_train.
    loss /= num_train
    dW /= num_train

    # L2 regularization: reg * sum(W^2) in the loss, hence 2 * reg * W in the
    # gradient.
    loss += reg * np.sum(W * W)
    dW += 2 * reg * W

    return loss, dW

这个题目主要难点是 Loss 对 W 的偏导数要弄清楚怎么求, 然后就可以程序实现了。 

首先看损失函数公式（1）![image.png](attachment:image.png)

对于某一个样本 Xi（shape=(1,D)）和权重 W（shape=(D,C)，所有样本共享这组参数），先得到各类别的得分 Xi·W（shape=(1,C)），再按公式（1）对所有 j != yi 的 margin 求和，得到该样本的标量损失 Li；然后反向求导求梯度 dW：
样本Xi分别与权重W中的每一列Wj相乘，得到对应的损失函数Lij，故dWj分别受Lij的影响。
对公式（1）求导：
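当 j == yi 时：dWyi = -k·Xi，其中 k 是满足 margin = s_j - s_yi + 1 > 0 的错误类别 j（j != yi）的个数
当 j != yi 且 margin = s_j - s_yi + 1 > 0 时：dWj = Xi（margin <= 0 时该列梯度为 0）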
(dWj和一个样本xi包含的元素一样多，xi对应位置的分量给对应位置的dWj分量带来贡献）
所有样本共享参数，故所有的样本累计求一遍，然后再除以样本总数，并加上正则项，就可以得到我们要求的 dW。

In [None]:
# Compute the loss and its gradient at W.
# W, X_dev and y_dev are not defined in this cell; they must already exist in
# the kernel. The regularization strength is 0.0 for this check.
loss, grad = svm_loss_naive(W, X_dev, y_dev, 0.0)

# Numerically compute the gradient along several randomly chosen dimensions, and
# compare them with your analytically computed gradient. The numbers should match
# almost exactly along all dimensions.
from cs231n.gradient_check import grad_check_sparse
# f takes only a weight matrix w and returns element [0] of the
# (loss, gradient) tuple, i.e. the scalar loss, with the data held fixed.
f = lambda w: svm_loss_naive(w, X_dev, y_dev, 0.0)[0]
# NOTE(review): grad_check_sparse is defined outside this notebook; presumably it
# perturbs entries of W, evaluates f, and compares against `grad` - confirm.
grad_numerical = grad_check_sparse(f, W, grad)

这里用 lambda 把 svm_loss_naive 包装成一个只接收权重 w 的函数 f。
在 grad_check_sparse 函数中，W 作为参数传给 f，f 把它代入 svm_loss_naive 并取返回值的第 [0] 项（即 loss），这样 loss 就只是关于 W 的函数，可以用数值差分法（有限差分）求梯度，再与解析梯度比较。

In [None]:
def svm_loss_vectorized(W, X, y, reg):
    """
    Structured SVM loss function, vectorized implementation.

    Inputs and outputs are the same as svm_loss_naive:

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    num_train = X.shape[0]
    rows = np.arange(num_train)

    #############################################################################
    # Vectorized loss.                                                          #
    #############################################################################
    scores = X.dot(W)  # (N, C)
    # Integer-array indexing picks scores[i, y[i]] for every row i; reshape to
    # a column so it broadcasts against every class score of the same row.
    correct_class_scores = scores[rows, y].reshape(num_train, 1)
    # Hinge with delta = 1: negative margins are clipped to zero.
    margins = np.maximum(0, scores - correct_class_scores + 1)
    # The correct class itself is excluded from the sum (j != y[i]).
    margins[rows, y] = 0
    loss = np.sum(margins)  # sum of all margin violations
    loss /= num_train
    loss += reg * np.sum(W ** 2)

    #############################################################################
    # Vectorized gradient, reusing `margins` from the loss computation.         #
    #############################################################################
    # mask[i, j] = 1 for every wrong class j that violates the margin for
    # sample i: X[i] is added to column j of dW.
    mask = np.zeros(margins.shape)
    mask[margins > 0] = 1
    # The correct-class column receives -X[i] once per violating class.
    incorrect_counts = np.sum(mask, axis=1)
    mask[rows, y] = -incorrect_counts
    dW = X.T.dot(mask)
    dW /= num_train
    # Derivative of reg * sum(W ** 2) is 2 * reg * W; this matches the loss
    # above and svm_loss_naive.
    dW += 2 * reg * W

    return loss, dW

这题里面有两点需要理解。第一点是 NumPy 的整数数组索引：ndarray[行索引数组, 列索引数组]（索引可以是 ndarray 或 list）会逐对取出 (行, 列) 位置上的元素，例如下面这行取出每个样本正确类别的得分：
correct_class_scores = scores[np.arange(num_train), y].reshape(num_train, 1)
举例如下：

In [10]:
import numpy as np
# Random 5x5 integer matrix; no seed is set, so the values change on every run.
a = np.random.randint(1,9,(5,5))
print(a)
# Index with a row-index array and a column-index list of equal length:
# element k of the result is a[k, cols[k]]. With cols = [0, 1, 2, 3, 4] this
# picks the main diagonal of a.
b = a[np.arange(5),[0,1,2,3,4]]
print(b)

[[1 5 3 8 4]
 [8 8 2 8 2]
 [6 1 2 1 2]
 [7 4 8 1 5]
 [5 5 3 4 1]]
[1 8 2 1 1]
