In [12]:
import numpy as np
import pandas as pd

# Numpy and Pandas with examples

## Read/Write to various file formats

### From CSV

In [13]:
# drinks = pd.read_csv('../../datasets/NAP/TR02_OUTLET_TINS.csv', dtype={'CR02_DATE_OPENED': pd.Int64Dtype()})
drinks = pd.read_csv('../../datasets/various/drinks.csv')
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF


### From JSON

In [14]:
authors_json_df = pd.read_json('../../datasets/python_books/authors.json')
authors_json_df.head(3)

Unnamed: 0,author,authorUrl
0,Mike Pirnat,http://mike.pirnat.com/
1,David Mertz,http://www.oreilly.com/programming/free/functi...
2,Muhammad Yasoob,http://pythontips.com/


### From SQL

Make sure you have created the database and the user. If you want to follow the examples, you can import '../../datasets/sql_dbs/python_books.sql' like:

`mysql -u userName -p -f < python_books.sql`

In [15]:
from sqlalchemy import create_engine
import pymysql

# create_engine(dialect+driver://username:password@host:port/database)
sql_engine = create_engine('mysql+pymysql://test:test1234@localhost/TestDB', pool_recycle=3600)
dbConn = sql_engine.connect()

authors_sql_df = pd.read_sql("select * from python_books", dbConn);
authors_sql_df


Unnamed: 0,id,author,authorUrl
0,151,TEST AUTHOR,TEST URL
1,152,Mike Pirnat,http://mike.pirnat.com/
2,153,David Mertz,http://www.oreilly.com/programming/free/functi...
3,154,Muhammad Yasoob,http://pythontips.com/
4,155,B. Miller & D. Ranum,http://reputablejournal.com
...,...,...,...
71,222,Tim Cox,https://www.packtpub.com/packt/free-ebook/pyth...
72,223,Massimo Di Pierro,https://twitter.com/mdipierro
73,224,Charles R. Severance,http://www.dr-chuck.com/
74,225,Caleb Hattingh,http://www.oreilly.com/pub/au/6789


### To CSV

In [16]:
# write to CSV:
authors_json_df.to_csv('./authors.csv', sep=',', encoding='utf-8', index=False)

# let's check the file
authors_csv_df = pd.read_csv('./authors.csv')
authors_csv_df.head(3)

Unnamed: 0,author,authorUrl
0,Mike Pirnat,http://mike.pirnat.com/
1,David Mertz,http://www.oreilly.com/programming/free/functi...
2,Muhammad Yasoob,http://pythontips.com/


## Analyse  data in a DataFrame

### Common dataset stats

In [20]:
# get info about the DataFrame object:
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       170 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


In [21]:
# get common stats about the data:
drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [22]:
# check for missing (NaN) values
drinks.isna().sum()

country                          0
beer_servings                    0
spirit_servings                  0
wine_servings                    0
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64

## Search data in DataFrame

In [28]:
# regex search string in cells:
import re

# get rows for country names starting with 'B' and ending on 'A':
regex = re.compile(r'^b.*a$', re.IGNORECASE)

mask = drinks.country.str.contains(regex)
drinks[mask].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
20,Bolivia,167,41,8,3.8,SA
21,Bosnia-Herzegovina,76,173,8,4.6,EU
22,Botswana,173,35,35,5.4,AF
25,Bulgaria,231,252,94,10.3,EU
