#  NumPy Basics <hr style="border:2.5px solid #126782"></hr>

Data Analyst: Gyro A. Madrona<br>
Department: Electrical Engineering

Install and update NumPy

In [119]:
#%pip install numpy --upgrade

Install and update SciPy

In [120]:
#%pip install scipy --upgrade

In [1]:
# imports and packages
import numpy as np
from scipy import stats

# 1D Array

In [2]:
# 1-dimensional array
array_a = np.array([1,2,3])
array_a

array([1, 2, 3])

In [4]:
# shape of the array (a 1-D array reports a single length)
np.shape(array_a)

(3,)

In [5]:
# 1-dimensional array
array_b = np.array([4,5,6])
array_b

array([4, 5, 6])

# 2D Array

In [6]:
# 2-dimensional array
my_array = np.array([[1,2,3],
                     [4,5,6]])
my_array

array([[1, 2, 3],
       [4, 5, 6]])

In [7]:
# shape of the matrix as (rows, columns)
np.shape(my_array)

(2, 3)

In [8]:
# transpose of a matrix
t_array = my_array.T
t_array

array([[1, 4],
       [2, 5],
       [3, 6]])

In [9]:
# shape of the transposed matrix as (rows, columns)
np.shape(t_array)

(3, 2)

# Measures of Central Tendency

## Fruit Price List

In [35]:
# fruit price list dataset
fruits = np.array([120,60,85,150,200])
fruits

array([120,  60,  85, 150, 200])

In [11]:
# mean
fruits_mean = np.mean(fruits)
fruits_mean

np.float64(123.0)

In [12]:
# median
fruits_median = np.median(fruits)
fruits_median

np.float64(120.0)

In [13]:
# sort
fruits_sorted = np.sort(fruits)
fruits_sorted

array([ 60,  85, 120, 150, 200])

In [36]:
# mode
fruits_mode = stats.mode(fruits)
fruits_mode

ModeResult(mode=np.int64(60), count=np.int64(1))

In [37]:
# when the most frequent value occurs only once, every value is unique,
# so there is no mode to report
print("No mode" if fruits_mode.count == 1 else fruits_mode.mode)

No mode


## Voltage Response

In [38]:
# voltage response dataset
voltage = np.array([
    [1,2,3,4,5,6,7,8],
    [12,5,9.1,3.3,24,18.5,15.2,np.nan],
    [2.8,4.5,6,9,11.7,14.8,17.3,20]
])
voltage

array([[ 1. ,  2. ,  3. ,  4. ,  5. ,  6. ,  7. ,  8. ],
       [12. ,  5. ,  9.1,  3.3, 24. , 18.5, 15.2,  nan],
       [ 2.8,  4.5,  6. ,  9. , 11.7, 14.8, 17.3, 20. ]])

In [40]:
# shape of the dataset as (rows, columns)
np.shape(voltage)

(3, 8)

In [42]:
# mean of each row (axis=1); a NaN anywhere in a row makes that row's mean NaN
voltage_mean = np.mean(voltage,axis=1)
voltage_mean

array([ 4.5   ,     nan, 10.7625])

In [43]:
# mean ignoring any NaN values
voltage_mean = np.nanmean(voltage,axis=1)
voltage_mean

array([ 4.5       , 12.44285714, 10.7625    ])

In [44]:
# transpose so that each original row becomes a column
# NOTE: this rebinds `voltage`, so re-running only this cell transposes it back
voltage = voltage.T
voltage

array([[ 1. , 12. ,  2.8],
       [ 2. ,  5. ,  4.5],
       [ 3. ,  9.1,  6. ],
       [ 4. ,  3.3,  9. ],
       [ 5. , 24. , 11.7],
       [ 6. , 18.5, 14.8],
       [ 7. , 15.2, 17.3],
       [ 8. ,  nan, 20. ]])

In [45]:
# shape after the transpose as (rows, columns)
np.shape(voltage)

(8, 3)

In [48]:
# mean ignoring any NaN values
voltage_mean = np.nanmean(voltage,axis=0)
voltage_mean

array([ 4.5       , 12.44285714, 10.7625    ])

In [49]:
# median ignoring any NaN values
voltage_median = np.nanmedian(voltage,axis=0)
voltage_median

array([ 4.5 , 12.  , 10.35])

In [50]:
# sort
voltage_sorted = np.sort(voltage,axis=0)
voltage_sorted

array([[ 1. ,  3.3,  2.8],
       [ 2. ,  5. ,  4.5],
       [ 3. ,  9.1,  6. ],
       [ 4. , 12. ,  9. ],
       [ 5. , 15.2, 11.7],
       [ 6. , 18.5, 14.8],
       [ 7. , 24. , 17.3],
       [ 8. ,  nan, 20. ]])

In [51]:
# mode of each column, ignoring any NaN values
# (the default nan_policy='propagate' would count NaN like any other value)
voltage_mode = stats.mode(voltage,axis=0,nan_policy='omit')
voltage_mode

ModeResult(mode=array([1. , 3.3, 2.8]), count=array([1., 1., 1.]))

# Measures of Variability

## Exam Performance

In [52]:
# exam performance dataset
grade = np.array([3.5,6.7,7,7.4,7.8,8.2,8.5,8.8,9,9.1,9.4,9.8])
grade

array([3.5, 6.7, 7. , 7.4, 7.8, 8.2, 8.5, 8.8, 9. , 9.1, 9.4, 9.8])

In [53]:
# maximum
grade_max = np.max(grade)
grade_max

np.float64(9.8)

In [54]:
# minimum
grade_min = np.min(grade)
grade_min

np.float64(3.5)

In [55]:
# range
grade_range = grade_max - grade_min
grade_range

np.float64(6.300000000000001)

In [56]:
# first quartile (Q1)
grade_q1 = np.percentile(grade,25) # 25%
grade_q1

np.float64(7.300000000000001)

In [57]:
# third quartile (Q3)
grade_q3 = np.percentile(grade,75) # 75%
grade_q3

np.float64(9.025)

In [58]:
# interquartile range (IQR)
grade_iqr = grade_q3 - grade_q1
grade_iqr

np.float64(1.7249999999999996)

In [59]:
# population variance
grade_var = np.var(grade)
grade_var

np.float64(2.635555555555556)

In [60]:
# sample variance
grade_var = np.var(grade,ddof=1)
grade_var

np.float64(2.8751515151515155)

In [61]:
# population standard deviation
grade_std = np.std(grade)
grade_std

np.float64(1.6234394215847896)

In [62]:
# sample standard deviation
grade_std = np.std(grade,ddof=1)
grade_std

np.float64(1.6956271745733245)

## Ice Cream Price List

In [63]:
# ice cream price list dataset
# entered row-wise (one row per currency) and transposed right away so that
# each currency ends up as a column
price = np.array([
    [3.5,4,3.75,4.25,3.9,4.1,3.6,4.5,3.8,4.15],                # USD
    [203,232,217.5,246.5,226.2,237.8,208.8,261,220.4,240.7],   # PHP
]).T
price

array([[  3.5 , 203.  ],
       [  4.  , 232.  ],
       [  3.75, 217.5 ],
       [  4.25, 246.5 ],
       [  3.9 , 226.2 ],
       [  4.1 , 237.8 ],
       [  3.6 , 208.8 ],
       [  4.5 , 261.  ],
       [  3.8 , 220.4 ],
       [  4.15, 240.7 ]])

In [64]:
# mean of USD
usd_mean = np.mean(price,axis=0)[0] # [0] 1st column
usd_mean

np.float64(3.9549999999999996)

In [67]:
# standard deviation of USD
usd_std = np.std(price,ddof=1,axis=0)[0] # [0] 1st column
usd_std

np.float64(0.3068206135331994)

In [68]:
# USD coefficient of variation
usd_cv = usd_std/usd_mean
usd_cv

np.float64(0.0775779048124398)

In [69]:
# mean of PHP
php_mean = np.mean(price,axis=0)[1] # [1] 2nd column
php_mean

np.float64(229.38999999999996)

In [70]:
# standard deviation of PHP
php_std = np.std(price,ddof=1,axis=0)[1] # [1] 2nd column
php_std

np.float64(17.795595584925564)

In [71]:
# PHP coefficient of variation
php_cv = php_std/php_mean
php_cv

np.float64(0.07757790481243981)

## Pooled Standard Deviation

In [112]:
# battery life dataset: a model label paired with a reading, 15 entries in all
# NOTE: putting labels and numbers in one array stores every entry as a string
battery = np.array([
    list('AAAAA' + 'BBBBB' + 'CCCCC'),
    [12.5,12.8,12.7,13.3,12.6,13.5,14.1,13.9,14.3,13.7,11.8,11.9,12.1,12.2,11.6],
]).T
battery

array([['A', '12.5'],
       ['A', '12.8'],
       ['A', '12.7'],
       ['A', '13.3'],
       ['A', '12.6'],
       ['B', '13.5'],
       ['B', '14.1'],
       ['B', '13.9'],
       ['B', '14.3'],
       ['B', '13.7'],
       ['C', '11.8'],
       ['C', '11.9'],
       ['C', '12.1'],
       ['C', '12.2'],
       ['C', '11.6']], dtype='<U32')

In [113]:
# extract rows where the 1st column is 'A'
model_a = battery[battery[:,0]=='A']
model_a

array([['A', '12.5'],
       ['A', '12.8'],
       ['A', '12.7'],
       ['A', '13.3'],
       ['A', '12.6']], dtype='<U32')

In [114]:
# extract rows where the 1st column is 'B'
model_b = battery[battery[:,0]=='B']
model_b

array([['B', '13.5'],
       ['B', '14.1'],
       ['B', '13.9'],
       ['B', '14.3'],
       ['B', '13.7']], dtype='<U32')

In [115]:
# extract rows where the 1st column is 'C'
model_c = battery[battery[:,0]=='C']
model_c

array([['C', '11.8'],
       ['C', '11.9'],
       ['C', '12.1'],
       ['C', '12.2'],
       ['C', '11.6']], dtype='<U32')

In [118]:
# type of data
type(model_a[0,1])

numpy.str_

In [120]:
# convert string to float
hours_a = model_a[:,1].astype(float)
hours_b = model_b[:,1].astype(float)
hours_c = model_c[:,1].astype(float)

In [121]:
hours_a

array([12.5, 12.8, 12.7, 13.3, 12.6])

In [122]:
# pooled variance: average of the sample variances, each weighted by its
# group's degrees of freedom (n - 1); when every group has the same size
# this reduces to the plain average of the variances
a_var = np.var(hours_a, ddof=1)
b_var = np.var(hours_b, ddof=1)
c_var = np.var(hours_c, ddof=1)

dof = [len(hours_a)-1, len(hours_b)-1, len(hours_c)-1]
ave_var = np.average([a_var,b_var,c_var], weights=dof)
ave_var

np.float64(0.08466666666666678)

In [123]:
# pooled standard deviation is the square root of the pooled variance
# (a plain average of the group variances equals the pooled variance only when
# all groups have the same sample size, as the three 5-reading groups here do)
pooled_std = np.sqrt(ave_var)
pooled_std

np.float64(0.29097537123726946)