In [2]:
import csv
with open('winequality-red.csv', 'r') as f:
    wines = list(csv.reader(f, delimiter=';'))
print(wines[:3])

[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'], ['7.4', '0.7', '0', '1.9', '0.076', '11', '34', '0.9978', '3.51', '0.56', '9.4', '5'], ['7.8', '0.88', '0', '2.6', '0.098', '25', '67', '0.9968', '3.2', '0.68', '9.8', '5']]


In [3]:
type(wines)

list

In [5]:
wines[0]

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

#### Find the average quality of the wines.

In [7]:
qualities = []
for item in wines[1:]:
    qualities.append(float(item[-1]))


In [10]:
avg_quality = sum(qualities)/len(qualities)
print(avg_quality)

5.6360225140712945


This can be much easier using Numpy.

In [12]:
import csv
with open("winequality-red.csv", 'r') as f:
    wines = list(csv.reader(f, delimiter=";"))
import numpy as np
wines = np.array(wines[1:], dtype=np.float)

In [13]:
wines.shape

(1599, 12)

In [14]:
wines.ndim

2

In [16]:
wines.size

19188

In [17]:
wines

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

#### Numpy Array Creation

In [18]:
np.random.rand(3,4)


array([[0.54984501, 0.23291677, 0.00959423, 0.86202672],
       [0.58828919, 0.30106963, 0.7965291 , 0.41652073],
       [0.79292345, 0.28893925, 0.51211589, 0.14091082]])

In [23]:
np.ones((2,3))

array([[1., 1., 1.],
       [1., 1., 1.]])

In [25]:
np.zeros((3,4)) #empty_array

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

#### Using NumPy To Read In Files

In [26]:
wines = np.genfromtxt("winequality-red.csv", delimiter=";", skip_header=1)

In [27]:
wines

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

Let’s select the element at row 3 and column 4.

In [33]:
wines[2,3]

2.3

### Slicing NumPy Arrays

In [34]:
wines[0:3,3]

array([1.9, 2.6, 2.3])

In [35]:
wines[:3,3]

array([1.9, 2.6, 2.3])

We can select an entire column by specifying that we want all the elements, from the first to the last. We specify this by just using the colon (:), with no starting or ending indices. The below code will select the entire fourth column:

In [36]:
wines[:,3]

array([1.9, 2.6, 2.3, ..., 2.3, 2. , 3.6])

We selected an entire column above, but we can also extract an entire row:

In [37]:
wines[3,:]

array([11.2  ,  0.28 ,  0.56 ,  1.9  ,  0.075, 17.   , 60.   ,  0.998,
        3.16 ,  0.58 ,  9.8  ,  6.   ])

If we take our indexing to the extreme, we can select the entire array using two colons to select all the rows and columns in wines. This is a great party trick, but doesn’t have a lot of good applications:

In [38]:
wines[:,:]

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

### Assigning Values To NumPy Arrays

We can also use indexing to assign values to certain elements in arrays. We can do this by assigning directly to the indexed value:

In [42]:
wines[1,5] = 10

In [43]:
wines[1,5]

10.0

In [40]:
wines[:,10] = 50

The above code overwrites all the values in the eleventh column with 50.

In [41]:
wines[:,10]

array([50., 50., 50., ..., 50., 50., 50.])

### 1-Dimensional NumPy Arrays

In [44]:
third_wine = wines[3,:]

In [45]:
third_wine

array([11.2  ,  0.28 ,  0.56 ,  1.9  ,  0.075, 17.   , 60.   ,  0.998,
        3.16 ,  0.58 , 50.   ,  6.   ])

In [46]:
third_wine[1]

0.28

In [50]:
np.random.rand(3)

array([0.74961071, 0.2817218 , 0.68665331])

### N-Dimensional NumPy Arrays

This doesn’t happen extremely often, but there are cases when you’ll want to deal with arrays that have greater than 3 dimensions. One way to think of this is as a list of lists of lists. Let’s say we want to store the monthly earnings of a store, but we want to be able to quickly lookup the results for a quarter, and for a year. The earnings for one year might look like this:

In [52]:
[500, 505, 490, 810, 450, 678, 234, 897, 430, 560, 1023, 640]

[500, 505, 490, 810, 450, 678, 234, 897, 430, 560, 1023, 640]

The store earned $500 in January, $505 in February, and so on. We can split up these earnings by quarter into a list of lists:

In [53]:
year_one = [
[500,505,490],
[810,450,678],
[234,897,430],
[560,1023,640]
]

We can retrieve the earnings from January by calling year_one[0][0]. If we want the results for a whole quarter, we can call year_one[0] or year_one[1]. We now have a 2-dimensional array, or matrix. But what if we now want to add the results from another year? We have to add a third dimension:

In [57]:
earning = [
[
[500,505,490],
[810,450,678],
[234,897,430],
[560,1023,640]
],
[
[600,605,490],
[345,900,1000],
[780,730,710],
[670,540,324]
]
]

We can retrieve the earnings from January of the first year by calling earnings[0][0][0]. We now need three indexes to retrieve a single element. A three-dimensional array in NumPy is much the same. In fact, we can convert earnings to an array and then get the earnings for January of the first year:

In [58]:
earnings = np.array(earning)
earnings[0,0,0]

500

In [59]:
earnings.shape

(2, 4, 3)

If we wanted to get first quarter earnings from both years, we could do this:m

In [61]:
earnings[:,0,:]

array([[500, 505, 490],
       [600, 605, 490]])

### NumPy Array Operations

#### Single Array Math

In [62]:
wines[:,11] + 10

array([15., 15., 15., ..., 16., 15., 16.])

In [63]:
wines[:,11] += 10
wines[:,11]

array([15., 15., 15., ..., 16., 15., 16.])

All the other operations work the same way. For example, if we want to multiply each of the quality score by 2, we could do it like this:

In [64]:
wines[:,11] * 2

array([30., 30., 30., ..., 32., 30., 32.])

#### Multiple Array Math

In [65]:
wines[:,11] + wines[:,11]

array([30., 30., 30., ..., 32., 30., 32.])

In [66]:
wines[:,10] * wines[:,11]

array([750., 750., 750., ..., 800., 750., 800.])

### Broadcasting

In [67]:
wines * np.array([1,2])

ValueError: operands could not be broadcast together with shapes (1599,12) (2,) 

The above example didn’t work because the two arrays don’t have a matching trailing dimension. Here’s an example where the last dimension does match:

In [68]:
array_one = np.array(
[
[1,2],
[3,4]
]
)
array_two = np.array([4,5])

In [69]:
array_one.shape

(2, 2)

In [70]:
array_two.shape

(2,)

In [71]:
array_one + array_two

array([[5, 7],
       [7, 9]])

Here’s an example with our wines data:

In [72]:
rand_array = np.random.rand(12)
wines + rand_array

array([[ 7.8887414 ,  1.0896762 ,  0.28900428, ...,  1.51740264,
        50.81922111, 15.69708324],
       [ 8.2887414 ,  1.2696762 ,  0.28900428, ...,  1.63740264,
        50.81922111, 15.69708324],
       [ 8.2887414 ,  1.1496762 ,  0.32900428, ...,  1.60740264,
        50.81922111, 15.69708324],
       ...,
       [ 6.7887414 ,  0.8996762 ,  0.41900428, ...,  1.70740264,
        50.81922111, 16.69708324],
       [ 6.3887414 ,  1.0346762 ,  0.40900428, ...,  1.66740264,
        50.81922111, 15.69708324],
       [ 6.4887414 ,  0.6996762 ,  0.75900428, ...,  1.61740264,
        50.81922111, 16.69708324]])

### NumPy Array Methods

In [73]:
wines[:,11].sum()

25002.0

In [74]:
wines.sum(axis=0)

array([13303.1    ,   843.985  ,   433.29   ,  4059.55   ,   139.859  ,
       25369.     , 74302.     ,  1593.79794,  5294.47   ,  1052.38   ,
       79950.     , 25002.     ])

In [75]:
wines.sum(axis=0).shape

(12,)

In [76]:
wines.sum(axis=1).shape

(1599,)

In [77]:
wines.sum(axis=1)

array([125.1438 , 158.2548 , 149.899  , ..., 149.48174, 155.01547,
       141.49249])

### Subsetting

In [78]:
high_quality = wines[:,11] > 7
wines[high_quality,:][:3,:]

array([[7.400e+00, 7.000e-01, 0.000e+00, 1.900e+00, 7.600e-02, 1.100e+01,
        3.400e+01, 9.978e-01, 3.510e+00, 5.600e-01, 5.000e+01, 1.500e+01],
       [7.800e+00, 8.800e-01, 0.000e+00, 2.600e+00, 9.800e-02, 1.000e+01,
        6.700e+01, 9.968e-01, 3.200e+00, 6.800e-01, 5.000e+01, 1.500e+01],
       [7.800e+00, 7.600e-01, 4.000e-02, 2.300e+00, 9.200e-02, 1.500e+01,
        5.400e+01, 9.970e-01, 3.260e+00, 6.500e-01, 5.000e+01, 1.500e+01]])

In [79]:
high_quality_and_alcohol = (wines[:,10] > 10) & (wines[:,11] > 7)
wines[high_quality_and_alcohol,10:]

array([[50., 15.],
       [50., 15.],
       [50., 15.],
       ...,
       [50., 16.],
       [50., 15.],
       [50., 16.]])

We can combine subsetting and assignment to overwrite certain values in an array:

In [80]:
high_quality_and_alcohol = (wines[:,10] > 10) & (wines[:,11] > 7)
wines[high_quality_and_alcohol,10:] = 20