In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import math
from torchsummary import summary
import numpy as np
import copy
from numpy import newaxis
from collections import defaultdict
from random import choices
import multiprocessing as mp

In [2]:
class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.fc1 = nn.Linear(28 * 28, 1000)
            self.fc2 = nn.Linear(1000, 10)
            

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x)

trained_nn=Net()
trained_nn.load_state_dict(torch.load("Trained_MNIST_model"))

In [3]:
print(trained_nn)
for param_tensor in trained_nn.state_dict():
    print(param_tensor, "\t", trained_nn.state_dict()[param_tensor].size())
#     print(param_tensor, "\t", len(trained_nn.state_dict()[param_tensor]))
    print(param_tensor, "\t", trained_nn.state_dict()[param_tensor])
# print(len(trained_nn.state_dict())//2+1)

Net(
  (fc1): Linear(in_features=784, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=10, bias=True)
)
fc1.weight 	 torch.Size([1000, 784])
fc1.weight 	 tensor([[ 0.0008, -0.0187,  0.0208,  ..., -0.0181,  0.0072,  0.0311],
        [ 0.0129, -0.0191, -0.0032,  ..., -0.0051, -0.0218,  0.0027],
        [ 0.0324,  0.0261,  0.0021,  ...,  0.0331, -0.0297, -0.0028],
        ...,
        [-0.0181, -0.0195, -0.0178,  ...,  0.0118, -0.0163, -0.0325],
        [ 0.0254,  0.0184,  0.0181,  ...,  0.0081,  0.0158,  0.0264],
        [-0.0274, -0.0291, -0.0372,  ..., -0.0074, -0.0170, -0.0284]])
fc1.bias 	 torch.Size([1000])
fc1.bias 	 tensor([-1.6859e-02,  1.3460e-02, -7.7439e-05, -1.3749e-02,  2.0384e-02,
        -7.0187e-03, -2.0665e-02, -8.3884e-04, -3.3272e-02,  3.1202e-02,
        -5.3926e-03,  1.7053e-02, -8.7766e-03, -6.1230e-03, -3.0541e-02,
        -1.5238e-02,  1.1690e-02, -1.2237e-02, -2.2264e-02, -1.4238e-02,
         2.8548e-02,  1.5772e-02, -3.2383e-02, -2.9

In [4]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device=torch.device("cpu")
print(device)

cpu


In [5]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_nn.to(device)
summary(trained_nn, input_size=(1, 28* 28), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1              [-1, 1, 1000]         785,000
            Linear-2                [-1, 1, 10]          10,010
Total params: 795,010
Trainable params: 795,010
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 3.03
Estimated Total Size (MB): 3.04
----------------------------------------------------------------


  # This is added back by InteractiveShellApp.init_path()


In [6]:
#Fetching the entire data and taking a subset of it
#Getting all the batches in a single list
batch_size=200
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=batch_size, shuffle=True)

In [7]:
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])),
        batch_size=batch_size, shuffle=True)



# create a loss function
criterion = nn.NLLLoss()
test_loss = 0
correct = 0
i=0
for data, target in test_loader:
#         print(data.shape, " ",target.shape)
    data, target = Variable(data, volatile=True), Variable(target)
    data = data.view(-1, 28 * 28)
#         i+=1
#     print(i)
#         print(data.shape, " ",target.shape)
    net_out = trained_nn(data)
    # sum up batch loss
    test_loss += criterion(net_out, target).data[0]
    pred = net_out.data.max(1)[1]  # get the index of the max log-probability
    correct += pred.eq(target.data).sum()

test_loss /= len(test_loader.dataset)
print('\nBefore Sparsification Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)))

  # This is added back by InteractiveShellApp.init_path()



Before Sparsification Test set: Average loss: 0.0003, Accuracy: 9800/10000 (98%)



In [8]:
def calNonZero(trained_nn, Layers):
    nzeros=0
    for l in Layers:
        for i in range(trained_nn.state_dict()[l].shape[0]):
            for j in range(trained_nn.state_dict()[l].shape[1]):
                if(trained_nn.state_dict()[l][i][j].item()!=0.0):
                    nzeros+=1

    return nzeros

In [9]:
Layers=["fc1.weight","fc2.weight"]
print("Number of non zero entries={}".format(calNonZero(trained_nn, Layers)))

Number of non zero entries=794000


In [10]:
print((trained_nn.state_dict()[Layers[0]][0][0]))

tensor(0.0008)


In [12]:
trained_nn=Net()
trained_nn.load_state_dict(torch.load("Trained_MNIST_model"))
Layers=["fc1.weight","fc2.weight"]

import time
start_time=time.time()

for l in range(len(Layers)): # l will point to one layer at a time
    prev_layer_nodes_count=len(trained_nn.state_dict()[Layers[l]][0])
    
    
    
    #****************************************************************************************************************
    #****************************************************************************************************************
    #****************************************************************************************************************
    #               Here in the below line we are choosing what percentage of nonzero weights we will keep
    #we are choosing the percentage of nnz() to be kept here in the below line
    
    percent_to_keep = 0.1
    m=math.ceil(percent_to_keep*(prev_layer_nodes_count))
                     
                     
    for i in range(len(trained_nn.state_dict()[Layers[l]])): # i will point to one node of the current layer at a time
        print("Sparsifying layer {}, node {}".format(l+1, i))
        w_small=trained_nn.state_dict()[Layers[l]][i]
        w_small_cap=torch.zeros([prev_layer_nodes_count], dtype=torch.float32)
        
        
        
        #****************************************************************************************************************
        #****************************************************************************************************************
        #****************************************************************************************************************
        #               Here in the below line we are choosing with replacement or without
    
        C=np.random.choice(prev_layer_nodes_count, m, replace=True)
        
        
        q_j=1/prev_layer_nodes_count
        
        for j in C:
            w_small_cap[j]+=(w_small[j])/(m*q_j)
            
        trained_nn.state_dict()[Layers[l]][i]=w_small_cap
        
        
end_time=time.time()
print("Total time taken={}".format(end_time-start_time))    


Sparsifying layer 1, node 0
Sparsifying layer 1, node 1
Sparsifying layer 1, node 2
Sparsifying layer 1, node 3
Sparsifying layer 1, node 4
Sparsifying layer 1, node 5
Sparsifying layer 1, node 6
Sparsifying layer 1, node 7
Sparsifying layer 1, node 8
Sparsifying layer 1, node 9
Sparsifying layer 1, node 10
Sparsifying layer 1, node 11
Sparsifying layer 1, node 12
Sparsifying layer 1, node 13
Sparsifying layer 1, node 14
Sparsifying layer 1, node 15
Sparsifying layer 1, node 16
Sparsifying layer 1, node 17
Sparsifying layer 1, node 18
Sparsifying layer 1, node 19
Sparsifying layer 1, node 20
Sparsifying layer 1, node 21
Sparsifying layer 1, node 22
Sparsifying layer 1, node 23
Sparsifying layer 1, node 24
Sparsifying layer 1, node 25
Sparsifying layer 1, node 26
Sparsifying layer 1, node 27
Sparsifying layer 1, node 28
Sparsifying layer 1, node 29
Sparsifying layer 1, node 30
Sparsifying layer 1, node 31
Sparsifying layer 1, node 32
Sparsifying layer 1, node 33
Sparsifying layer 1, nod

Sparsifying layer 1, node 329
Sparsifying layer 1, node 330
Sparsifying layer 1, node 331
Sparsifying layer 1, node 332
Sparsifying layer 1, node 333
Sparsifying layer 1, node 334
Sparsifying layer 1, node 335
Sparsifying layer 1, node 336
Sparsifying layer 1, node 337
Sparsifying layer 1, node 338
Sparsifying layer 1, node 339
Sparsifying layer 1, node 340
Sparsifying layer 1, node 341
Sparsifying layer 1, node 342
Sparsifying layer 1, node 343
Sparsifying layer 1, node 344
Sparsifying layer 1, node 345
Sparsifying layer 1, node 346
Sparsifying layer 1, node 347
Sparsifying layer 1, node 348
Sparsifying layer 1, node 349
Sparsifying layer 1, node 350
Sparsifying layer 1, node 351
Sparsifying layer 1, node 352
Sparsifying layer 1, node 353
Sparsifying layer 1, node 354
Sparsifying layer 1, node 355
Sparsifying layer 1, node 356
Sparsifying layer 1, node 357
Sparsifying layer 1, node 358
Sparsifying layer 1, node 359
Sparsifying layer 1, node 360
Sparsifying layer 1, node 361
Sparsifyin

Sparsifying layer 1, node 671
Sparsifying layer 1, node 672
Sparsifying layer 1, node 673
Sparsifying layer 1, node 674
Sparsifying layer 1, node 675
Sparsifying layer 1, node 676
Sparsifying layer 1, node 677
Sparsifying layer 1, node 678
Sparsifying layer 1, node 679
Sparsifying layer 1, node 680
Sparsifying layer 1, node 681
Sparsifying layer 1, node 682
Sparsifying layer 1, node 683
Sparsifying layer 1, node 684
Sparsifying layer 1, node 685
Sparsifying layer 1, node 686
Sparsifying layer 1, node 687
Sparsifying layer 1, node 688
Sparsifying layer 1, node 689
Sparsifying layer 1, node 690
Sparsifying layer 1, node 691
Sparsifying layer 1, node 692
Sparsifying layer 1, node 693
Sparsifying layer 1, node 694
Sparsifying layer 1, node 695
Sparsifying layer 1, node 696
Sparsifying layer 1, node 697
Sparsifying layer 1, node 698
Sparsifying layer 1, node 699
Sparsifying layer 1, node 700
Sparsifying layer 1, node 701
Sparsifying layer 1, node 702
Sparsifying layer 1, node 703
Sparsifyin

In [13]:
print(trained_nn)
for param_tensor in trained_nn.state_dict():
    print(param_tensor, "\t", trained_nn.state_dict()[param_tensor].size())
#     print(param_tensor, "\t", len(trained_nn.state_dict()[param_tensor]))
    print(param_tensor, "\t", trained_nn.state_dict()[param_tensor])
# print(len(trained_nn.state_dict())//2+1)

Net(
  (fc1): Linear(in_features=784, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=10, bias=True)
)
fc1.weight 	 torch.Size([1000, 784])
fc1.weight 	 tensor([[ 0.0082,  0.0000,  0.0000,  ...,  0.0000,  0.0713,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.2164,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000, -0.1935,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
fc1.bias 	 torch.Size([1000])
fc1.bias 	 tensor([-1.6859e-02,  1.3460e-02, -7.7439e-05, -1.3749e-02,  2.0384e-02,
        -7.0187e-03, -2.0665e-02, -8.3884e-04, -3.3272e-02,  3.1202e-02,
        -5.3926e-03,  1.7053e-02, -8.7766e-03, -6.1230e-03, -3.0541e-02,
        -1.5238e-02,  1.1690e-02, -1.2237e-02, -2.2264e-02, -1.4238e-02,
         2.8548e-02,  1.5772e-02, -3.2383e-02, -2.9

In [14]:
number_non_zero=calNonZero(trained_nn, Layers)
print("Number of non zero entries after saparsifying={}".format(number_non_zero))

Number of non zero entries after saparsifying=76174


In [15]:
print(number_non_zero/794000)

0.09593702770780857


In [16]:
summary(trained_nn, input_size=(1, 28* 28), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1              [-1, 1, 1000]         785,000
            Linear-2                [-1, 1, 10]          10,010
Total params: 795,010
Trainable params: 795,010
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 3.03
Estimated Total Size (MB): 3.04
----------------------------------------------------------------


  # This is added back by InteractiveShellApp.init_path()


In [17]:
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])),
        batch_size=batch_size, shuffle=True)



# create a loss function
criterion = nn.NLLLoss()
test_loss = 0
correct = 0
i=0
for data, target in test_loader:
#         print(data.shape, " ",target.shape)
    data, target = Variable(data, volatile=True), Variable(target)
    data = data.view(-1, 28 * 28)
#         i+=1
#     print(i)
#         print(data.shape, " ",target.shape)
    net_out = trained_nn(data)
    # sum up batch loss
    test_loss += criterion(net_out, target).data[0]
    pred = net_out.data.max(1)[1]  # get the index of the max log-probability
    correct += pred.eq(target.data).sum()

test_loss /= len(test_loader.dataset)
print('\nAfter sparsification Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)))

  # This is added back by InteractiveShellApp.init_path()



After sparsification Test set: Average loss: 0.0615, Accuracy: 2495/10000 (24%)



Reference:

[1] https://github.com/adventuresinML/adventures-in-ml-code/blob/master/pytorch_nn.py [28.08.2019]

[2] https://towardsdatascience.com/model-summary-in-pytorch-b5a1e4b64d25 [02.09.2019]