<a href="https://colab.research.google.com/github/Ranjani94/Deep_Learning/blob/master/Ungraded_Assignment_4/pytorch3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Chapter 3:

1. Representing different types of real-world data as
PyTorch tensors
2. Working with a range of data types, including
spreadsheet, time series, text, image, and medical
imaging data
3. Loading data from file
4. Converting data to tensors
5. Shaping tensors so that they can be used as inputs for
neural network models


- Tabular Data
- Time Series
- Text
- Text Embeddings
- Images
- Volumetric Data

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; force_remount=True re-attaches
# even when a mount already exists at that location.
drive.mount('/content/gdrive', force_remount=True)

# Drive root and the project folder beneath it.
root_dir = "/content/gdrive/My Drive/"
base_dir = f"{root_dir}Deep_Learning/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import numpy as np
import torch
# Keep printed tensors short: once a tensor has more than 50 elements it is
# summarized, showing 2 items at the start and end of each dimension.
torch.set_printoptions(edgeitems=2, threshold=50)

In [3]:
import imageio

# Decode the JPEG into a NumPy array and inspect its dimensions.
flower_path = '/content/gdrive/My Drive/Deep_Learning/flower.jpg'
img_arr = imageio.imread(flower_path)
img_arr.shape

(533, 800, 3)

In [0]:
# Wrap the decoded image array as a tensor (from_numpy shares memory with img_arr).
img = torch.from_numpy(img_arr)
# Reorder the axes from (0, 1, 2) to (2, 0, 1); for the (533, 800, 3) array
# shown above this yields a (3, 533, 800) view without copying data.
out = img.permute(2, 0, 1)

In [0]:
# Pre-allocate a zero-filled uint8 tensor with batch_size slots,
# each of shape 3 x 100 x 100.
batch_size = 3
batch = torch.zeros((batch_size, 3, 100, 100), dtype=torch.uint8)

In [0]:
import os

# Fill the pre-allocated `batch` tensor with the .jpg images found in data_dir.
# Expects `batch` to be the (N, 3, H, W) uint8 tensor allocated in the earlier cell.
data_dir = '/content/gdrive/My Drive/Deep_Learning/strawberry'
# os.listdir returns entries in arbitrary order; sort so slot assignment is reproducible.
filenames = sorted(name for name in os.listdir(data_dir)
                   if os.path.splitext(name)[-1] == '.jpg')
# Spatial size every image must be resized to before it fits into a batch slot.
target_hw = tuple(batch.shape[-2:])
# Only as many images as the batch has slots can be stored.
for i, filename in enumerate(filenames[:batch.shape[0]]):
    img_arr = imageio.imread(os.path.join(data_dir, filename))
    img_t = torch.from_numpy(img_arr)
    img_t = img_t.permute(2, 0, 1)  # move the last axis (channels) to the front
    img_t = img_t[:3]  # keep at most the first three channels
    # A direct `batch[i] = img_t` only works when the image already has exactly
    # the slot's size, so resize first. interpolate needs a float tensor with a
    # leading batch dimension.
    resized = torch.nn.functional.interpolate(
        img_t.unsqueeze(0).float(),
        size=target_hw,
        mode='bilinear',
        align_corners=False,
    )
    # Round and clamp back into the 0..255 range before casting to the batch dtype.
    batch[i] = resized.squeeze(0).round().clamp(0, 255).to(batch.dtype)

In [0]:
# Convert to float32 and scale in place by 1/255
# (maps 0..255 pixel values to the 0..1 range).
batch = batch.float().div_(255.0)

In [0]:
# Standardize each channel of `batch` in place: subtract the channel mean and
# divide by the channel standard deviation, both computed over the whole batch.
n_channels = batch.shape[1]
# Lower bound for the divisor: a constant channel (e.g. an all-zero batch) has
# std == 0 and would otherwise turn every value into NaN via 0 / 0.
eps = 1e-8
for c in range(n_channels):
    channel = batch[:, c]  # view onto channel c across all batch entries
    mean = torch.mean(channel)
    std = torch.std(channel)
    batch[:, c] = (channel - mean) / torch.clamp(std, min=eps)

### Volumetric Data

In [0]:
import imageio

# A DCM file is an image file saved in the Digital Imaging and Communications in Medicine (DICOM) image format.
dir_path = "/content/gdrive/My Drive/Deep_Learning/000001.dcm"
# Reading the DICOM data takes a long time, so expect this cell to be slow.
vol_arr = imageio.volread(dir_path, 'DICOM')
vol_arr.shape

In [0]:

# Convert the volume to a float32 tensor, swap its first and third axes,
# then prepend a size-1 dimension at position 0.
vol = torch.from_numpy(vol_arr).to(torch.float32)
vol = vol.transpose(0, 2)
vol = vol.unsqueeze(0)

vol.shape

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt

# Display the entry at index 50 along the first axis of the loaded volume.
# NOTE(review): assumes vol_arr has more than 50 entries on axis 0 — confirm for this DICOM input.
plt.imshow(vol_arr[50])

### Tabular Data

In [22]:

import csv

# Parse the semicolon-separated file into a float32 matrix,
# skipping the single header row.
wine_path = "/content/gdrive/My Drive/Deep_Learning/wine.csv"
wineq_numpy = np.loadtxt(
    wine_path,
    delimiter=";",
    skiprows=1,
    dtype=np.float32,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.9 ,  0.33,  0.21, ...,  0.45,  9.4 ,  6.  ],
       [ 7.7 ,  0.29,  0.48, ...,  0.64, 10.6 ,  6.  ],
       [ 7.1 ,  0.39,  0.35, ...,  0.29, 11.6 ,  5.  ]], dtype=float32)

In [23]:

# Read only the header row to get the column names. The with-block guarantees
# the file handle is closed; newline='' is what the csv module expects for
# file objects it reads from.
with open(wine_path, newline='') as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=';'))

wineq_numpy.shape, col_list

((370, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [24]:
# Wrap the NumPy matrix as a tensor; from_numpy keeps the float32 dtype
# and shares memory with wineq_numpy.
wineq = torch.from_numpy(wineq_numpy)

wineq.shape, wineq.dtype

(torch.Size([370, 12]), torch.float32)

In [25]:
data = wineq[:, :-1] # all rows, every column except the last ('quality' in col_list)
data, data.shape

(tensor([[ 7.0000,  0.2700,  ...,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  ...,  0.4900,  9.5000],
         ...,
         [ 7.7000,  0.2900,  ...,  0.6400, 10.6000],
         [ 7.1000,  0.3900,  ...,  0.2900, 11.6000]]), torch.Size([370, 11]))

In [26]:
target = wineq[:, -1] # all rows, last column only ('quality' in col_list); still float32
target, target.shape


(tensor([6., 6.,  ..., 6., 5.]), torch.Size([370]))

In [27]:
# Re-extract the last column as int64 so its values can be used as indices.
target = wineq[:, -1].to(torch.long)
target

tensor([6, 6,  ..., 6, 5])

In [28]:
# One row per sample and ten columns; scatter_ writes 1.0 into the column
# selected by each sample's target value.
n_samples = target.shape[0]
target_onehot = torch.zeros(n_samples, 10)

target_onehot.scatter_(dim=1, index=target.unsqueeze(1), value=1.0)

tensor([[0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.]])

In [29]:
# Add a trailing size-1 dimension: shape (N,) becomes (N, 1).
target_unsqueezed = target.unsqueeze(1)
target_unsqueezed

tensor([[6],
        [6],
        ...,
        [6],
        [5]])

In [30]:
# Column-wise mean: reduce over dim 0 (the rows), one value per column.
data_mean = data.mean(dim=0)
data_mean

tensor([6.8405e+00, 2.8796e-01, 3.4305e-01, 6.9322e+00, 4.9127e-02, 3.7580e+01,
        1.4736e+02, 9.9460e-01, 3.2017e+00, 4.7841e-01, 1.0073e+01])

In [31]:
# Column-wise variance: reduce over dim 0 (the rows), one value per column.
data_var = data.var(dim=0)
data_var

tensor([5.8177e-01, 1.1074e-02, 1.4929e-02, 2.8180e+01, 4.3050e-04, 2.7362e+02,
        1.9307e+03, 7.3165e-06, 2.1931e-02, 1.0675e-02, 1.1336e+00])

In [32]:
# Standardize every column: subtract its mean and divide by its standard
# deviation (square root of the variance); broadcasting applies this per row.
data_std = data_var.sqrt()
data_normalized = (data - data_mean) / data_std
data_normalized

tensor([[ 0.2091, -0.1707,  ..., -0.2749, -1.1956],
        [-0.7087,  0.1144,  ...,  0.1122, -0.5382],
        ...,
        [ 1.1268,  0.0194,  ...,  1.5640,  0.4950],
        [ 0.3402,  0.9697,  ..., -1.8235,  1.4342]])

In [33]:

# Boolean mask selecting the rows whose target value is 3 or lower.
bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([370]), torch.bool, tensor(3))

In [34]:

# Boolean-mask indexing: keep only the rows of data where bad_indexes is True.
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([3, 11])

In [35]:

# Split the rows into three groups by target value: <= 3, 4..6, and >= 7.
is_bad = target <= 3
is_mid = (target > 3) & (target < 7)
is_good = target >= 7
bad_data, mid_data, good_data = data[is_bad], data[is_mid], data[is_good]

# Per-column mean of each group.
bad_mean, mid_mean, good_mean = (
    group.mean(dim=0) for group in (bad_data, mid_data, good_data)
)

# One line per column: index, column name, then the three group means.
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.80   6.89   6.58
 1 volatile acidity       0.36   0.29   0.28
 2 citric acid            0.34   0.34   0.34
 3 residual sugar         7.10   7.62   3.81
 4 chlorides              0.06   0.05   0.04
 5 free sulfur dioxide   26.67  38.72  32.89
 6 total sulfur dioxide 162.67 152.48 123.29
 7 density                1.00   1.00   0.99
 8 pH                     3.26   3.19   3.27
 9 sulphates              0.44   0.47   0.50
10 alcohol               10.00   9.80  11.31


In [36]:
# Flag the rows whose value in column 6 ('total sulfur dioxide' in col_list)
# lies below the threshold.
# NOTE(review): the origin of 141.83 is not shown in this notebook — confirm.
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]
predicted_indexes = total_sulfur_data < total_sulfur_threshold

predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()


(torch.Size([370]), torch.bool, tensor(160))

In [37]:

# Boolean mask selecting the rows whose target value is above 5.
actual_indexes = torch.gt(target, 5)

actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([370]), torch.bool, tensor(230))

In [38]:
# Count the rows flagged by both masks, and the rows flagged by each mask alone.
n_matches = (actual_indexes & predicted_indexes).sum().item()
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

# Matches, share of predictions that were right, share of actual positives found.
n_matches, n_matches / n_predicted, n_matches / n_actual

(109, 0.68125, 0.47391304347826085)

### Time Series

In [82]:

# Load the comma-separated file as float32, skipping the header row.
# Column 1 gets a custom converter that keeps only characters 8:10 of the field.
# NOTE(review): presumably the field is a YYYY-MM-DD date, making this the day
# of the month — confirm against the CSV.
bikes_numpy = np.loadtxt("/content/gdrive/My Drive/Deep_Learning/hour1.csv", 
                         dtype=np.float32, 
                         delimiter=",", 
                         skiprows=1, 
                         converters={1: lambda x: float(x[8:10])})
bikes = torch.from_numpy(bikes_numpy)
bikes

tensor([[1.0000e+00, 1.0000e+00,  ..., 1.3000e+01, 1.6000e+01],
        [2.0000e+00, 1.0000e+00,  ..., 3.2000e+01, 4.0000e+01],
        ...,
        [1.7378e+04, 3.1000e+01,  ..., 4.8000e+01, 6.1000e+01],
        [1.7379e+04, 3.1000e+01,  ..., 3.7000e+01, 4.9000e+01]])

In [83]:
# stride() reports how many storage elements are skipped per step along each dimension.
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [85]:
# Regroup the rows into blocks of 24: (N, C) becomes (N // 24, 24, C).
# view() reuses the same storage, so N must be a multiple of 24.
daily_bikes = bikes.view(-1, 24, bikes.shape[1])
daily_bikes.shape, daily_bikes.stride()


(torch.Size([730, 24, 17]), (408, 17, 1))

In [86]:
# Swap the last two axes: (B, 24, C) becomes (B, C, 24). Only the strides change.
# Note: this rebinds daily_bikes, so re-running the cell swaps the axes back.
daily_bikes = daily_bikes.transpose(1, 2)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

In [87]:
# First 24 rows of bikes, cast to int64 so column values can be used as indices.
first_day = bikes[:24].long()
# Zero matrix with one row per selected row and 4 columns for a one-hot encoding.
weather_onehot = torch.zeros(first_day.shape[0], 4)
# Show column 9 of those rows (the values later used as scatter indices).
first_day[:,9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [88]:

# Column 9 holds 1-based codes; subtract 1 to obtain 0-based column indices,
# shaped (rows, 1) as scatter_ along dim 1 requires.
weather_index = first_day[:, 9].unsqueeze(1).long() - 1
weather_onehot.scatter_(dim=1, index=weather_index, value=1.0)


tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [89]:

# Append the 4 one-hot columns to the first 24 rows (concatenate along dim 1)
# and display only the first resulting row.
torch.cat((bikes[:24], weather_onehot), 1)[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [90]:

# Zero tensor matching daily_bikes in its first and last dimensions,
# with 4 slots along dim 1 for a one-hot encoding.
daily_weather_onehot = torch.zeros(daily_bikes.shape[0], 4, daily_bikes.shape[2])
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [91]:
# Take channel 9 of every block, cast to int64, insert a size-1 dim at
# position 1 and shift the 1-based codes to 0-based indices for scatter_.
hourly_weather_index = daily_bikes[:, 9, :].long().unsqueeze(1) - 1
daily_weather_onehot.scatter_(dim=1, index=hourly_weather_index, value=1.0)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [0]:
# Append the one-hot channels after the existing channels along dim 1.
# Note: this rebinds daily_bikes, so re-running the cell appends them again.
daily_bikes = torch.cat((daily_bikes, daily_weather_onehot), dim=1)

In [0]:

# Rescale channel 9 in place with (x - 1) / 3, so 1 maps to 0.0 and 4 maps to 1.0.
# Note: not idempotent — re-running rescales the already-rescaled values.
daily_bikes[:, 9, :] = (daily_bikes[:, 9, :] - 1.0) / 3.0

In [0]:

# Min-max scale channel 10 in place, using the global min and max of that channel.
temp = daily_bikes[:, 10, :]
temp_min = temp.min()
temp_max = temp.max()
daily_bikes[:, 10, :] = (temp - temp_min) / (temp_max - temp_min)

In [0]:

# Standardize channel 10 in place: subtract its global mean and divide by its
# global standard deviation.
temp = daily_bikes[:, 10, :]
daily_bikes[:, 10, :] = (temp - temp.mean()) / temp.std()

### Text

In [0]:
# Read the whole UTF-8 file into a single string; the with-block closes the file afterwards.
with open('/content/gdrive/My Drive/MLSpring2020/the_expendables_AirQuality_Traffic/LDAData/Obama_2015.txt', encoding='utf8') as f:
    text = f.read()

In [72]:

# Split the text on newlines and pick the line at index 200 as a working example.
lines = text.split('\n')
line = lines[200]
line


'like lower mortgage premiums and a higher minimum wagethese ideas will'

In [73]:
letter_t = torch.zeros(len(line), 128) # one row per character of line, 128 columns (one per ASCII code)
letter_t.shape

torch.Size([70, 128])

In [0]:
# One-hot encode each character of the lower-cased, stripped line:
# row i gets a 1 in the column given by the character's code point.
for i, letter in enumerate(line.lower().strip()):
    code_point = ord(letter)
    # Code points outside the 128-column range fall back to column 0.
    letter_index = code_point if code_point < 128 else 0
    letter_t[i, letter_index] = 1

In [75]:
def clean_words(input_str):
    """Split a string into lower-cased words with edge punctuation removed.

    The text is lower-cased, newlines are treated as spaces, and the result is
    split on whitespace. Punctuation is stripped only from the start and end of
    each token, so inner characters (e.g. the hyphen in 'foo-bar') are kept and
    a token made purely of punctuation becomes an empty string.

    Returns:
        list of str, one entry per whitespace-separated token.
    """
    strip_chars = '.,;:"!?”“_-'
    tokens = input_str.lower().replace('\n', ' ').split()
    return list(map(lambda token: token.strip(strip_chars), tokens))

# Tokenize the example line and display the raw line next to its word list.
words_in_line = clean_words(line)
line, words_in_line

('like lower mortgage premiums and a higher minimum wagethese ideas will',
 ['like',
  'lower',
  'mortgage',
  'premiums',
  'and',
  'a',
  'higher',
  'minimum',
  'wagethese',
  'ideas',
  'will'])

In [0]:

# Vocabulary: the distinct cleaned words of the whole text in sorted order,
# each mapped to its position in that ordering.
word_list = sorted(set(clean_words(text)))
word2index_dict = dict(zip(word_list, range(len(word_list))))

# len(word2index_dict), word2index_dict['impossible']

In [77]:

# One-hot encode the words of the example line: one row per word,
# one column per vocabulary entry.
vocab_size = len(word2index_dict)
word_t = torch.zeros(len(words_in_line), vocab_size)
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i, word_index] = 1
    # Report position, vocabulary index and the word itself.
    print('{:2} {:4} {}'.format(i, word_index, word))

print(word_t.shape)

 0  855 like
 1  878 lower
 2  957 mortgage
 3 1135 premiums
 4  105 and
 5   34 a
 6  706 higher
 7  936 minimum
 8 1632 wagethese
 9  734 ideas
10 1678 will
torch.Size([11, 1721])


In [78]:

# Insert a size-1 dimension at position 1: (words, vocab) becomes (words, 1, vocab).
# Note: this rebinds word_t, so re-running the cell adds another dimension.
word_t = word_t.unsqueeze(1)
word_t.shape

torch.Size([11, 1, 1721])

In [79]:
# List every distinct character of text together with its code point, in sorted order.
[(c, ord(c)) for c in sorted(set(text))]


[('\n', 10),
 (' ', 32),
 ('"', 34),
 ('$', 36),
 ("'", 39),
 (',', 44),
 ('-', 45),
 ('.', 46),
 ('/', 47),
 ('0', 48),
 ('1', 49),
 ('2', 50),
 ('3', 51),
 ('4', 52),
 ('5', 53),
 ('6', 54),
 ('7', 55),
 ('8', 56),
 ('9', 57),
 (':', 58),
 (';', 59),
 ('?', 63),
 ('A', 65),
 ('B', 66),
 ('C', 67),
 ('D', 68),
 ('E', 69),
 ('F', 70),
 ('G', 71),
 ('H', 72),
 ('I', 73),
 ('J', 74),
 ('K', 75),
 ('L', 76),
 ('M', 77),
 ('N', 78),
 ('O', 79),
 ('P', 80),
 ('R', 82),
 ('S', 83),
 ('T', 84),
 ('U', 85),
 ('V', 86),
 ('W', 87),
 ('Y', 89),
 ('a', 97),
 ('b', 98),
 ('c', 99),
 ('d', 100),
 ('e', 101),
 ('f', 102),
 ('g', 103),
 ('h', 104),
 ('i', 105),
 ('j', 106),
 ('k', 107),
 ('l', 108),
 ('m', 109),
 ('n', 110),
 ('o', 111),
 ('p', 112),
 ('q', 113),
 ('r', 114),
 ('s', 115),
 ('t', 116),
 ('u', 117),
 ('v', 118),
 ('w', 119),
 ('x', 120),
 ('y', 121),
 ('z', 122)]

In [80]:

# Code point of the lowercase letter 'l'.
ord('l')

108