In [None]:
import numpy as np
import pandas as pd

# Numpy and Pandas with examples

## Read/Write to various file formats

### From CSV

In [2]:
# drinks = pd.read_csv('../../datasets/NAP/TR02_OUTLET_TINS.csv', dtype={'CR02_DATE_OPENED': pd.Int64Dtype()})
drinks = pd.read_csv('../../datasets/various/drinks.csv')
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF


### From JSON

In [3]:
authors_json_df = pd.read_json('../../datasets/python_books/authors.json')
authors_json_df.head(3)

Unnamed: 0,author,authorUrl
0,Mike Pirnat,http://mike.pirnat.com/
1,David Mertz,http://www.oreilly.com/programming/free/functi...
2,Muhammad Yasoob,http://pythontips.com/


### From SQL

Make sure you have created the database and the user. If you want to follow the examples, you can import '../../datasets/sql_dbs/python_books.sql' like:

`mysql -u userName -p -f < python_books.sql`

In [4]:
from sqlalchemy import create_engine
import pymysql

# create_engine(dialect+driver://username:password@host:port/database)
sql_engine = create_engine('mysql+pymysql://test:test1234@localhost/TestDB', pool_recycle=3600)
dbConn = sql_engine.connect()

authors_sql_df = pd.read_sql("select * from python_books", dbConn);
authors_sql_df


Unnamed: 0,id,author,authorUrl
0,151,TEST AUTHOR,TEST URL
1,152,Mike Pirnat,http://mike.pirnat.com/
2,153,David Mertz,http://www.oreilly.com/programming/free/functi...
3,154,Muhammad Yasoob,http://pythontips.com/
4,155,B. Miller & D. Ranum,http://reputablejournal.com
...,...,...,...
71,222,Tim Cox,https://www.packtpub.com/packt/free-ebook/pyth...
72,223,Massimo Di Pierro,https://twitter.com/mdipierro
73,224,Charles R. Severance,http://www.dr-chuck.com/
74,225,Caleb Hattingh,http://www.oreilly.com/pub/au/6789


### To CSV

In [5]:
# write to CSV:
authors_json_df.to_csv('./authors.csv', sep=',', encoding='utf-8', index=False)

# let's check the file
authors_csv_df = pd.read_csv('./authors.csv')
authors_csv_df.head(3)

Unnamed: 0,author,authorUrl
0,Mike Pirnat,http://mike.pirnat.com/
1,David Mertz,http://www.oreilly.com/programming/free/functi...
2,Muhammad Yasoob,http://pythontips.com/


## Analyse  data in a DataFrame

### Common dataset stats

In [6]:
# get info about the DataFrame object:
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       170 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


In [7]:
# get common stats about the data:
drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [8]:
# check for missing (NaN) values
drinks.isna().sum()

country                          0
beer_servings                    0
spirit_servings                  0
wine_servings                    0
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64

## Search data in DataFrame

### Select single column

In [9]:
# select single column - square bracket notation:
# use this, when labels are not valid Python identifiers (i.e. contains spaces)
drinks['country']

0      Afghanistan
1          Albania
2          Algeria
3          Andorra
4           Angola
          ...     
188      Venezuela
189        Vietnam
190          Yemen
191         Zambia
192       Zimbabwe
Name: country, Length: 193, dtype: object

In [10]:
## select single column - dot notation:
# this is the prefered way, if the column label is valid Python identifier
drinks.country

0      Afghanistan
1          Albania
2          Algeria
3          Andorra
4           Angola
          ...     
188      Venezuela
189        Vietnam
190          Yemen
191         Zambia
192       Zimbabwe
Name: country, Length: 193, dtype: object

### Select multiple columns

In [11]:
# pass column names as list:
drinks[ ['country', 'continent'] ]

Unnamed: 0,country,continent
0,Afghanistan,AS
1,Albania,EU
2,Algeria,AF
3,Andorra,EU
4,Angola,AF
...,...,...
188,Venezuela,SA
189,Vietnam,AS
190,Yemen,AS
191,Zambia,AF


### Slice rows and columns with loc

(label-based)

In [12]:
# get rows from 10 to 20(inclusive), and all columns
drinks.loc[10:20,:]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
10,Azerbaijan,21,46,5,1.3,EU
11,Bahamas,122,176,51,6.3,
12,Bahrain,42,63,7,2.0,AS
13,Bangladesh,0,0,0,0.0,AS
14,Barbados,143,173,36,6.3,
15,Belarus,142,373,42,14.4,EU
16,Belgium,295,84,212,10.5,EU
17,Belize,263,114,8,6.8,
18,Benin,34,4,13,1.1,AF
19,Bhutan,23,0,0,0.4,AS


In [13]:
# get all rows for columns from 'beer_servings' till 'wine_servings' (inclusive)
drinks.loc[:, 'beer_servings': 'wine_servings']

Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0,0,0
1,89,132,54
2,25,0,14
3,245,138,312
4,217,57,45
...,...,...,...
188,333,100,3
189,111,2,1
190,6,0,0
191,32,19,4


In [14]:
# get rows from 10 to 20 for columns from 'beer_servings' till 'wine_servings' (inclusive)
drinks.loc[10:20, 'beer_servings': 'wine_servings']

Unnamed: 0,beer_servings,spirit_servings,wine_servings
10,21,46,5
11,122,176,51
12,42,63,7
13,0,0,0
14,143,173,36
15,142,373,42
16,295,84,212
17,263,114,8
18,34,4,13
19,23,0,0


### Slice columns with iloc

(index-based)

In [15]:
# get first 3 rows:
drinks.iloc[0:3,:]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF


In [16]:
# get first 3 columns
drinks.iloc[:, 0:3]

Unnamed: 0,country,beer_servings,spirit_servings
0,Afghanistan,0,0
1,Albania,89,132
2,Algeria,25,0
3,Andorra,245,138
4,Angola,217,57
...,...,...,...
188,Venezuela,333,100
189,Vietnam,111,2
190,Yemen,6,0
191,Zambia,32,19


In [17]:
# get first 3 rows and last 3 columns:
drinks.iloc[0:3, -3:]

Unnamed: 0,wine_servings,total_litres_of_pure_alcohol,continent
0,0,0.0,AS
1,54,4.9,EU
2,14,0.7,AF


### Find data in DataFrame, using Boolean mask

In [18]:
# get rows for Bulgaria:
mask = drinks.country == 'Bulgaria'
drinks[mask]

# or shortly:
# drinks[drinks.country == 'Bulgaria']

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
25,Bulgaria,231,252,94,10.3,EU


In [19]:
# get rows for country names starting with 'B' and ending on 'A':
# regex search in cells:
import re

regex = re.compile(r'^b.*a$', re.IGNORECASE)

mask = drinks.country.str.contains(regex)
drinks[mask].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
20,Bolivia,167,41,8,3.8,SA
21,Bosnia-Herzegovina,76,173,8,4.6,EU
22,Botswana,173,35,35,5.4,AF
25,Bulgaria,231,252,94,10.3,EU


In [20]:
# find the country with max wine_servings in all data

# get max wine_servings value
max_wine_servings = drinks.wine_servings.max()

# display the row with max_wine_servings:
drinks[drinks.wine_servings == max_wine_servings]

# get only the value in country column:
drinks[drinks.wine_servings == max_wine_servings].country

61    France
Name: country, dtype: object

In [21]:
# find the country with max wine_servings in SA

# get max wine_servings value for SA
SA_max_wine_servings = drinks[drinks.continent=='SA'].wine_servings.max()
SA_max_wine_servings

# get the row for country with SA_max_wine_servings
drinks[(drinks.continent=='SA') & (drinks.wine_servings==SA_max_wine_servings)]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
6,Argentina,193,25,221,8.3,SA


In [23]:
# find the max wine_servings per each continent
drinks.groupby('continent').wine_servings.max()

continent
AF    233
AS    123
EU    370
OC    212
SA    221
Name: wine_servings, dtype: int64