# NumPy
Numeric Python
* Collection of values
* NumPy arrays are an alternative to lists, but a single array cannot hold values of different data types
* Easy and fast numeric operations

### Usefulness

In [4]:
height = 1.8
weight = 50

In [5]:
bmi = weight / height ** 2

In [6]:
bmi

15.432098765432098

In [2]:
height = [1.73, 1.68, 1.71, 1.89, 1.79]
weight = [65.4, 59.2, 63.6, 88.4, 68.7]

In [11]:
weight / height ** 2

TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'

In [6]:
import numpy as np

In [17]:
# %pip install numpy  # uncomment and run once if NumPy is not installed in this kernel's environment

In [7]:
np_height = np.array(height)
np_weight = np.array(weight)

In [8]:
type(np_height)

numpy.ndarray

In [9]:
type(height)

list

In [10]:
np_bmi = np_weight / np_height ** 2

In [12]:
list(np_bmi)

[21.85171572722109,
 20.97505668934241,
 21.750282138093777,
 24.74734749867025,
 21.44127836209856]

#### Difference from lists

In [13]:
np.array([1.0, "is", True])  # Array contains only one type of data

array(['1.0', 'is', 'True'], dtype='<U32')

In [14]:
list_1 = [1, 2, 3]
list_2 = [4, 5, 6]

In [15]:
list_1 + list_2

[1, 2, 3, 4, 5, 6]

In [16]:
summed = [list_1[0] + list_2[0], list_1[1] + list_2[1], list_1[2] + list_2[2]]

In [17]:
summed

[5, 7, 9]

In [18]:
# Element-wise sum of two lists using a loop.
# zip() walks both lists in step and yields the items that share a position,
# so no nested loop or index comparison is needed.
summed_loop = []
for v, m in zip(list_1, list_2):
    summed_loop.append(v + m)
print(summed_loop)

[5, 7, 9]


In [19]:
array_1 = np.array(list_1)  # Unlike lists, arithmetic on NumPy arrays is applied element by element
array_2 = np.array(list_2)

In [23]:
array_1 * array_2

array([ 4, 10, 18])

In [21]:
type(array_1)

numpy.ndarray

### 2D NumPy Arrays

In [24]:
np_2d = np.array([[1.73, 1.68, 1.71, 1.89, 1.79], [65.4, 59.2, 63.6, 88.4, 68.7]])

In [25]:
np_2d

array([[ 1.73,  1.68,  1.71,  1.89,  1.79],
       [65.4 , 59.2 , 63.6 , 88.4 , 68.7 ]])

In [26]:
np_2d.shape

(2, 5)

In [33]:
np_2d[0]

array([1.73, 1.68, 1.71, 1.89, 1.79])

In [27]:
np_2d[0][1]

1.68

In [28]:
np_2d[0,1]  # works either way: same element as np_2d[0][1]

1.68

In [30]:
a = np_2d[:,1:3]

In [31]:
a

array([[ 1.68,  1.71],
       [59.2 , 63.6 ]])

In [32]:
np_2d[1, :]

array([65.4, 59.2, 63.6, 88.4, 68.7])

In [33]:
# NOTE: despite the name, np_3d is still a 2-D array: three rows of five values each.
np_3d = np.array([[1.73, 1.68, 1.71, 1.89, 1.79], [65.4, 59.2, 63.6, 88.4, 68.7], [10, 20, 30, 40, 50]])

In [34]:
np_3d

array([[ 1.73,  1.68,  1.71,  1.89,  1.79],
       [65.4 , 59.2 , 63.6 , 88.4 , 68.7 ],
       [10.  , 20.  , 30.  , 40.  , 50.  ]])

In [37]:
np_3d[:1, 2:4]

array([[1.71, 1.89]])

### Generating datasets

Simulating a dataset of height and weight of all the residents in a city. Suppose the population of this city is 5000.

In [39]:
# Seed NumPy's global random generator so the simulated city is identical on every run
# (later draws from np.random in this notebook become repeatable as well).
np.random.seed(42)
# Draw 5000 heights from a normal distribution (mean 1.75, standard deviation 0.20),
# rounded to two decimals.
height = np.round(np.random.normal(1.75, 0.20, 5000), 2)

In [40]:
height

array([1.31, 1.89, 1.67, ..., 1.25, 1.84, 1.39])

In [41]:
len(height)

5000

In [42]:
# Draw 5000 weights from a normal distribution (mean 60.32, standard deviation 15),
# rounded to two decimals: one value per simulated resident.
weight = np.round(np.random.normal(60.32, 15, 5000), 2)

In [44]:
np_city = np.column_stack((height, weight))

In [45]:
np_city

array([[ 1.31, 64.28],
       [ 1.89, 76.44],
       [ 1.67, 86.42],
       ...,
       [ 1.25, 62.03],
       [ 1.84, 61.1 ],
       [ 1.39, 79.22]])

##### 1. What are the mean, median, and standard deviation of the height of the people in the city?

In [48]:
np_city[:, 0]

array([1.31, 1.89, 1.67, ..., 1.25, 1.84, 1.39])

In [50]:
np_height = np.array(np_city[:,0])

In [51]:
np.mean(np_height)

1.7461

In [52]:
np.median(np_height)

1.75

In [52]:
print(np.median(np_height))

1.75


In [53]:
print(np.std(np_height))

0.19753873884380246


##### 2. Is there any correlation between height and weight?

In [55]:
corr = np.corrcoef(np_city[:,0], np_city[:,1])

In [56]:
corr

array([[ 1.        , -0.00227708],
       [-0.00227708,  1.        ]])

### Exploring AppleStore data file with NumPy array:

In [57]:
from csv import reader

In [58]:
# Read the whole CSV into a list of rows (each row is a list of strings).
# The `with` block closes the file as soon as reading is done, even if parsing fails.
# newline='' is what the csv module documentation asks for, so that quoted fields
# containing line breaks are parsed correctly.
with open('AppleStore.csv', encoding='utf-8', newline='') as opened_file:
    read_file = reader(opened_file)
    apps_data = list(read_file)

In [59]:
type(apps_data)

list

In [61]:
apps = np.array(apps_data)

In [62]:
apps

array([['id', 'track_name', 'size_bytes', ..., 'ipadSc_urls.num',
        'lang.num', 'vpp_lic'],
       ['284882215', 'Facebook', '389879808', ..., '1', '29', '1'],
       ['389801252', 'Instagram', '113954816', ..., '0', '29', '1'],
       ...,
       ['1070052833', 'Go!Go!Cat!', '91468800', ..., '2', '2', '1'],
       ['1081295232', 'Suppin Detective: Expose their true visage!',
        '83026944', ..., '0', '1', '1'],
       ['977965019', 'みんなのお弁当 by クックパッド ~お弁当をレシピ付きで記録・共有~', '51174400',
        ..., '0', '1', '1']], dtype='<U232')

In [72]:
apps_data[0]

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices.num',
 'ipadSc_urls.num',
 'lang.num',
 'vpp_lic']

In [63]:
rating = np.array(apps[1:,7])

In [64]:
rating

array(['3.5', '4.5', '4.5', ..., '0', '0', '0'], dtype='<U232')

In [65]:
price = np.array(apps[1:, 4])

In [66]:
price

array(['0', '0', '0', ..., '0', '0', '0'], dtype='<U232')

In [67]:
np.mean(price)

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U232'), dtype('<U232')) -> None

In [68]:
p =  price.astype(float) # changing data type of numpy array

In [69]:
p

array([0., 0., 0., ..., 0., 0., 0.])

In [70]:
print(np.mean(p))

1.726217868556343


In [71]:
print(np.median(p))

0.0


In [72]:
print(np.std(p))

5.8326005341785105


In [73]:
r =  rating.astype(float)

In [74]:
print(np.mean(r))

3.526955675976101


In [75]:
print(np.corrcoef(p, r))

[[1.       0.046601]
 [0.046601 1.      ]]


In [87]:
print(np.corrcoef(apps[1:,4], apps[1:,7]))

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U232'), dtype('<U232')) -> None

In [77]:
rating_price = np.column_stack((r, p))

In [78]:
rating_price

array([[3.5, 0. ],
       [4.5, 0. ],
       [4.5, 0. ],
       ...,
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ]])

In [79]:
print(np.corrcoef(rating_price[:,0], rating_price[:,1]))

[[1.       0.046601]
 [0.046601 1.      ]]


#### Pandas
In the AppleStore data some columns are numeric and some are strings. When we read it as a list of lists, or as a NumPy array, we need to subset or slice the dataset and convert one column at a time to float in order to do numeric operations. That takes more code. Cleaning the dataset, such as dropping duplicates or handling missing values, is also difficult with lists or arrays. Pandas handles mixed column types directly and can read many common data file formats.

In [None]:
# %pip install pandas  # uncomment and run once if pandas is not installed in this kernel's environment

In [80]:
import pandas as pd

In [81]:
# NOTE(review): the name `dict` shadows Python's built-in dict type for the rest of the
# session. Renaming it (e.g. to brics_data) has to be done together with the later
# pd.DataFrame(dict) cell, so it is left unchanged here.
# Each key is a column name; each list holds that column's values, one entry per country.
dict = {
"country":["Brazil", "Russia", "India", "China", "South Africa"],
"capital":["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
"area":[8.516, 17.10, 3.286, 9.597, 1.221],
"population":[200.4, 143.5, 1252, 1357, 52.98] }

In [84]:
t = pd.DataFrame(rating_price)

In [82]:
brics = pd.DataFrame(dict)

In [83]:
brics

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98


In [96]:
# Keys have become column labels and values have become column values, column by column.

In [90]:
# Select two columns by passing a list of column names; the result is a smaller DataFrame.
# NOTE(review): this reuses the name `r`, which earlier held the float rating array, and the
# result is not read again in the cells visible here. Consider a distinct name.
r = brics[['country', 'area']]

In [98]:
brics.dtypes

country        object
capital        object
area          float64
population    float64
dtype: object

# CSV

In [None]:
# pandas reader functions for CSV and Excel files (only referenced here, not called).
pd.read_csv
pd.read_excel

In [92]:
apps_data = pd.read_csv('AppleStore.csv')

In [95]:
apps_data.head(10)

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,284882215,Facebook,389879808,USD,0.0,2974676,212,3.5,3.5,95,4+,Social Networking,37,1,29,1
1,389801252,Instagram,113954816,USD,0.0,2161558,1289,4.5,4.0,10.23,12+,Photo & Video,37,0,29,1
2,529479190,Clash of Clans,116476928,USD,0.0,2130805,579,4.5,4.5,9.24.12,9+,Games,38,5,18,1
3,420009108,Temple Run,65921024,USD,0.0,1724546,3842,4.5,4.0,1.6.2,9+,Games,40,5,1,1
4,284035177,Pandora - Music & Radio,130242560,USD,0.0,1126879,3594,4.0,4.5,8.4.1,12+,Music,37,4,1,1
5,429047995,Pinterest,74778624,USD,0.0,1061624,1814,4.5,4.0,6.26,12+,Social Networking,37,5,27,1
6,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1
7,553834731,Candy Crush Saga,222846976,USD,0.0,961794,2453,4.5,4.5,1.101.0,4+,Games,43,5,24,1
8,324684580,Spotify Music,132510720,USD,0.0,878563,8253,4.5,4.5,8.4.3,12+,Music,37,5,18,1
9,343200656,Angry Birds,175966208,USD,0.0,824451,107,4.5,3.0,7.4.0,4+,Games,38,0,10,1


In [97]:
apps_data.tail(2)

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
7195,1081295232,Suppin Detective: Expose their true visage!,83026944,USD,0.0,0,0,0.0,0.0,1.0.3,12+,Entertainment,40,0,1,1
7196,977965019,みんなのお弁当 by クックパッド ~お弁当をレシピ付きで記録・共有~,51174400,USD,0.0,0,0,0.0,0.0,1.4.0,4+,Food & Drink,37,0,1,1


In [100]:
apps_data.dtypes

id                    int64
track_name           object
size_bytes            int64
currency             object
price               float64
rating_count_tot      int64
rating_count_ver      int64
user_rating         float64
user_rating_ver     float64
ver                  object
cont_rating          object
prime_genre          object
sup_devices.num       int64
ipadSc_urls.num       int64
lang.num              int64
vpp_lic               int64
dtype: object

In [99]:
apps_data.isna()

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7192,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7193,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7194,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7195,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [104]:
apps_data.isna().sum()

id                  0
track_name          0
size_bytes          0
currency            0
price               0
rating_count_tot    0
rating_count_ver    0
user_rating         0
user_rating_ver     0
ver                 0
cont_rating         0
prime_genre         0
sup_devices.num     0
ipadSc_urls.num     0
lang.num            0
vpp_lic             0
dtype: int64

#### summary statistics 

* continuous variable

In [100]:
apps_data['user_rating'].mean()

3.526955675976101

In [101]:
apps_data['user_rating'].mode()

0    4.5
Name: user_rating, dtype: float64

In [102]:
apps_data['user_rating'].median()

4.0

In [103]:
apps_data['user_rating'].std()

1.5179475936298683

#### Frequency table

In [104]:
apps_data['prime_genre'].value_counts()

prime_genre
Games                3862
Entertainment         535
Education             453
Photo & Video         349
Utilities             248
Health & Fitness      180
Productivity          178
Social Networking     167
Lifestyle             144
Music                 138
Shopping              122
Sports                114
Book                  112
Finance               104
Travel                 81
News                   75
Weather                72
Reference              64
Food & Drink           63
Business               57
Navigation             46
Medical                23
Catalogs               10
Name: count, dtype: int64

In [160]:
### Reading SAARC

In [106]:
saarc = pd.read_csv("3a79e33b-0b7f-47c5-af75-3c9c7ddb3ba6_Data.csv")

In [107]:
saarc

Unnamed: 0,Time,Time Code,Country Name,Country Code,Population density (people per sq. km of land area) [EN.POP.DNST],"Population, total [SP.POP.TOTL]",GDP growth (annual %) [NY.GDP.MKTP.KD.ZG],GDP per capita (constant 2015 US$) [NY.GDP.PCAP.KD],GDP (current US$) [NY.GDP.MKTP.CD],Gini index [SI.POV.GINI]
0,2023,YR2023,Afghanistan,AFG,..,41454760.0,2.266944,378.066303,17152230000.0,..
1,2023,YR2023,Bangladesh,BGD,..,171467000.0,5.775112,1885.377336,437415300000.0,..
2,2023,YR2023,Bhutan,BTN,..,786385.0,4.882594,3488.844491,3019254000.0,..
3,2023,YR2023,India,IND,..,1438070000.0,9.190755,2270.905181,3638489000000.0,..
4,2023,YR2023,Maldives,MDV,..,525994.0,4.726392,11415.939509,6590894000.0,..
5,2023,YR2023,Sri Lanka,LKA,..,22037000.0,-2.329848,3964.919874,83716140000.0,..
6,2023,YR2023,Pakistan,PAK,..,247504500.0,-0.039839,1616.396701,337885500000.0,..
7,2023,YR2023,Nepal,NPL,..,29694610.0,1.982548,1136.427693,41047770000.0,..
8,,,,,,,,,,
9,,,,,,,,,,


In [109]:
# Keep only the first 8 rows (one per country); rows 8 and 9 of the loaded file are entirely empty.
saarc = saarc.iloc[:8, :]

In [123]:
saarc

Unnamed: 0,Time,Time Code,Country Name,Country Code,Population density (people per sq. km of land area) [EN.POP.DNST],population,gdp_growth,gdp_per_capita,GDP (current US$) [NY.GDP.MKTP.CD],Gini index [SI.POV.GINI],population_million
0,2023,YR2023,Afghanistan,AFG,..,41454760.0,2.266944,378.066303,17152230000.0,..,41.454761
1,2023,YR2023,Bangladesh,BGD,..,171467000.0,5.775112,1885.377336,437415300000.0,..,171.46699
2,2023,YR2023,Bhutan,BTN,..,786385.0,4.882594,3488.844491,3019254000.0,..,0.786385
3,2023,YR2023,India,IND,..,1438070000.0,9.190755,2270.905181,3638489000000.0,..,1438.069596
4,2023,YR2023,Maldives,MDV,..,525994.0,4.726392,11415.939509,6590894000.0,..,0.525994
5,2023,YR2023,Sri Lanka,LKA,..,22037000.0,-2.329848,3964.919874,83716140000.0,..,22.037
6,2023,YR2023,Pakistan,PAK,..,247504500.0,-0.039839,1616.396701,337885500000.0,..,247.504495
7,2023,YR2023,Nepal,NPL,..,29694610.0,1.982548,1136.427693,41047770000.0,..,29.694614


In [124]:
saarc.shape

(8, 11)

In [112]:
saarc.dtypes

Time                                                                  object
Time Code                                                             object
Country Name                                                          object
Country Code                                                          object
Population density (people per sq. km of land area) [EN.POP.DNST]     object
Population, total [SP.POP.TOTL]                                      float64
GDP growth (annual %) [NY.GDP.MKTP.KD.ZG]                            float64
GDP per capita (constant 2015 US$) [NY.GDP.PCAP.KD]                  float64
GDP (current US$) [NY.GDP.MKTP.CD]                                   float64
Gini index [SI.POV.GINI]                                              object
dtype: object

In [113]:
saarc = saarc.rename(columns = {"Population, total [SP.POP.TOTL]" : "population"})

In [None]:
# Next column to rename: "GDP growth (annual %) [NY.GDP.MKTP.KD.ZG]"

In [115]:
saarc = saarc.rename(columns = {"GDP growth (annual %) [NY.GDP.MKTP.KD.ZG]" : "gdp_growth"})

In [117]:
saarc = saarc.rename(columns = {"GDP per capita (constant 2015 US$) [NY.GDP.PCAP.KD]" : "gdp_per_capita"})

In [121]:
# Finding out summary statistics of a column
print(saarc["gdp_per_capita"].mean().round(2))
print(saarc["population"].mean().round(2))

3269.61
243942479.38


In [122]:
# Express the population in millions in a new column.
# assign() builds a new DataFrame instead of writing into the existing one, so the cell
# can be re-run safely and avoids pandas' SettingWithCopyWarning on a frame that was
# produced by slicing.
saarc = saarc.assign(population_million=saarc['population'] / 1_000_000)

In [125]:
brics

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98


In [126]:
saarc

Unnamed: 0,Time,Time Code,Country Name,Country Code,Population density (people per sq. km of land area) [EN.POP.DNST],population,gdp_growth,gdp_per_capita,GDP (current US$) [NY.GDP.MKTP.CD],Gini index [SI.POV.GINI],population_million
0,2023,YR2023,Afghanistan,AFG,..,41454760.0,2.266944,378.066303,17152230000.0,..,41.454761
1,2023,YR2023,Bangladesh,BGD,..,171467000.0,5.775112,1885.377336,437415300000.0,..,171.46699
2,2023,YR2023,Bhutan,BTN,..,786385.0,4.882594,3488.844491,3019254000.0,..,0.786385
3,2023,YR2023,India,IND,..,1438070000.0,9.190755,2270.905181,3638489000000.0,..,1438.069596
4,2023,YR2023,Maldives,MDV,..,525994.0,4.726392,11415.939509,6590894000.0,..,0.525994
5,2023,YR2023,Sri Lanka,LKA,..,22037000.0,-2.329848,3964.919874,83716140000.0,..,22.037
6,2023,YR2023,Pakistan,PAK,..,247504500.0,-0.039839,1616.396701,337885500000.0,..,247.504495
7,2023,YR2023,Nepal,NPL,..,29694610.0,1.982548,1136.427693,41047770000.0,..,29.694614


In [127]:
brics_country = set(brics['country'].unique())

In [128]:
brics_country

{'Brazil', 'China', 'India', 'Russia', 'South Africa'}

In [129]:
saarc_country = set(saarc['Country Name'].unique())

In [130]:
brics_country - saarc_country

{'Brazil', 'China', 'Russia', 'South Africa'}

In [131]:
saarc_country - brics_country

{'Afghanistan',
 'Bangladesh',
 'Bhutan',
 'Maldives',
 'Nepal',
 'Pakistan',
 'Sri Lanka'}

In [132]:
saarc_country & brics_country

{'India'}

In [137]:
brics

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98


In [133]:
brics['population'].mean() 

601.176

In [134]:
saarc['population_million'].mean().round(2)

243.94

In [135]:
brics['population'].mean() == saarc['population_million'].mean().round(2)

False

In [136]:
brics['population'].mean() > saarc['population_million'].mean().round(2)

True