In [2]:
import numpy as np
import pandas as pd

# NumPy and pandas with examples

## Read/Write to various file formats

### From CSV

In [4]:
# Load the NAP outlet register; no dtype is passed, so pandas infers each column.
# (An earlier variant of this call forced a nullable integer on one column via
#  dtype={'CR02_DATE_OPENED': pd.Int64Dtype()} - kept here for reference only.)
nap1 = pd.read_csv(
    '../../datasets/NAP/TR02_OUTLET_TINS.csv',
)
# Preview the first three rows.
nap1.head(n=3)

Unnamed: 0,CR01_TIN_ID,CR02_EMAIL,CR02_WEB_SITE,CR02_TELEX_NO,CR02_FAX_NUMBER,CR02_TRADE_NAME,CR02_OUTLET_NAME,CR02_DATE_OPENED,CR02_OUTLET_CODE,CG03_OUTLET_TYPE,CR02_DATE_CLOSED,CG04_GROUP_SECTOR,CR02_PHONE_NUMBER,CR02_MOBILE_NUMBER,CG03_OUTLET_STATUS,CR02_FAX_AREA_CODE,CR02_PHONE_AREA_CODE,CR02_AUDIT_TRAIL_USER,CR02_AUDIT_TRAIL_DATE,CR02_AUDIT_TRAIL_ACTION
0,324852794,,,,,,КАТИНА ХРИСТОВА ШАРКОВА,,0,21905,,,,,30603,,,AR GRAO,2012-09-13,Automatic registration
1,324852795,,,,,,РУСА СТОЯНОВА ИВАНОВА,,0,21905,,,,,30603,,,AR GRAO,2012-10-12,Automatic registration
2,324852796,,,,,,СТОЯН ДИМИТРОВ ТОДОРОВ,,0,21905,,,,,30603,,,AR GRAO,2014-04-10,Automatic registration


### From JSON

In [8]:
# Parse the authors JSON file into a DataFrame and preview the first three rows.
authors_json_df = pd.read_json(
    '../../datasets/python_books/authors.json',
)
authors_json_df.head(n=3)

Unnamed: 0,author,authorUrl
0,Mike Pirnat,http://mike.pirnat.com/
1,David Mertz,http://www.oreilly.com/programming/free/functi...
2,Muhammad Yasoob,http://pythontips.com/


### From SQL

Make sure you have created the database and the user first. To follow the examples, import the dump file `../../datasets/sql_dbs/python_books.sql` as follows:

`mysql -u userName -p -f < python_books.sql`

In [9]:
from sqlalchemy import create_engine
import pymysql  # driver named by the 'mysql+pymysql' dialect in the URL below

# create_engine(dialect+driver://username:password@host:port/database)
# NOTE(review): credentials are hardcoded here; acceptable for a local
# throw-away database, otherwise read them from the environment.
sql_engine = create_engine('mysql+pymysql://test:test1234@localhost/TestDB', pool_recycle=3600)

# Borrow a connection only for the duration of the query. The `with` block
# closes it even if the query raises, so no connection stays checked out for
# the lifetime of the kernel. `dbConn` is closed after this block; use
# `sql_engine` for any later database access.
with sql_engine.connect() as dbConn:
    authors_sql_df = pd.read_sql("select * from python_books", dbConn)

authors_sql_df


Unnamed: 0,id,author,authorUrl
0,151,TEST AUTHOR,TEST URL


### To CSV

In [10]:
# Persist the authors table as a comma-separated, UTF-8 encoded file,
# leaving the DataFrame index out of the output.
authors_json_df.to_csv(
    './authors.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

# Round-trip check: load the file back and preview the first three rows.
authors_csv_df = pd.read_csv('./authors.csv')
authors_csv_df.head(n=3)

Unnamed: 0,author,authorUrl
0,Mike Pirnat,http://mike.pirnat.com/
1,David Mertz,http://www.oreilly.com/programming/free/functi...
2,Muhammad Yasoob,http://pythontips.com/


### To SQL

In [12]:
# (disabled) Peek at the first 300 characters of the raw JSON file:
# json_file = '../../datasets/python_books/authors.json'

# f = open(json_file)
# print(f.read(300))

In [14]:
# Preview the frame before it is written to the database.
# If it carried a stray 'Unnamed: 0' column, that column would have to be
# dropped first, e.g. authors_csv_df.drop(columns=['Unnamed: 0']).
authors_csv_df.head(n=3)


Unnamed: 0,author,authorUrl
0,Mike Pirnat,http://mike.pirnat.com/
1,David Mertz,http://www.oreilly.com/programming/free/functi...
2,Muhammad Yasoob,http://pythontips.com/


In [15]:
# Append the author rows to the `python_books` table (existing rows are kept);
# the DataFrame index is not written as a column.
authors_csv_df.to_sql(
    name='python_books',
    con=sql_engine,
    if_exists='append',
    index=False,
)

## Analyse data in a DataFrame

In [20]:
# Recap: show the first five rows of the NAP DataFrame again.
nap1.head(n=5)

Unnamed: 0,CR01_TIN_ID,CR02_EMAIL,CR02_WEB_SITE,CR02_TELEX_NO,CR02_FAX_NUMBER,CR02_TRADE_NAME,CR02_OUTLET_NAME,CR02_DATE_OPENED,CR02_OUTLET_CODE,CG03_OUTLET_TYPE,CR02_DATE_CLOSED,CG04_GROUP_SECTOR,CR02_PHONE_NUMBER,CR02_MOBILE_NUMBER,CG03_OUTLET_STATUS,CR02_FAX_AREA_CODE,CR02_PHONE_AREA_CODE,CR02_AUDIT_TRAIL_USER,CR02_AUDIT_TRAIL_DATE,CR02_AUDIT_TRAIL_ACTION
0,324852794,,,,,,КАТИНА ХРИСТОВА ШАРКОВА,,0,21905,,,,,30603,,,AR GRAO,2012-09-13,Automatic registration
1,324852795,,,,,,РУСА СТОЯНОВА ИВАНОВА,,0,21905,,,,,30603,,,AR GRAO,2012-10-12,Automatic registration
2,324852796,,,,,,СТОЯН ДИМИТРОВ ТОДОРОВ,,0,21905,,,,,30603,,,AR GRAO,2014-04-10,Automatic registration
3,324852797,,,,,,ИВАНКА ХРИСТОВА НИКОЛОВА,,0,21905,,,,,30601,,,DM GRAO,2006-12-26,Data migration
4,324852798,,,,,,КАТЕРИНА ТОДОРОВА МРЪЧЕВА,,0,21905,,,,,30603,,,AR GRAO,2013-08-01,Automatic registration


In [35]:
# Count the missing (NaN) entries in each column.
nap1.isnull().sum(axis=0)

CR01_TIN_ID                     0
CR02_EMAIL                 455063
CR02_WEB_SITE              455063
CR02_TELEX_NO              455063
CR02_FAX_NUMBER            455063
CR02_TRADE_NAME            455063
CR02_OUTLET_NAME                0
CR02_DATE_OPENED           455062
CR02_OUTLET_CODE                0
CG03_OUTLET_TYPE                0
CR02_DATE_CLOSED           455063
CG04_GROUP_SECTOR          455063
CR02_PHONE_NUMBER          455063
CR02_MOBILE_NUMBER         455063
CG03_OUTLET_STATUS              0
CR02_FAX_AREA_CODE         455063
CR02_PHONE_AREA_CODE       455063
CR02_AUDIT_TRAIL_USER           0
CR02_AUDIT_TRAIL_DATE           0
CR02_AUDIT_TRAIL_ACTION         0
dtype: int64

In [40]:
# Print a per-column summary of the DataFrame (non-null counts and dtypes).
nap1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455063 entries, 0 to 455062
Data columns (total 20 columns):
CR01_TIN_ID                455063 non-null int64
CR02_EMAIL                 0 non-null float64
CR02_WEB_SITE              0 non-null float64
CR02_TELEX_NO              0 non-null float64
CR02_FAX_NUMBER            0 non-null float64
CR02_TRADE_NAME            0 non-null float64
CR02_OUTLET_NAME           455063 non-null object
CR02_DATE_OPENED           1 non-null object
CR02_OUTLET_CODE           455063 non-null int64
CG03_OUTLET_TYPE           455063 non-null int64
CR02_DATE_CLOSED           0 non-null float64
CG04_GROUP_SECTOR          0 non-null float64
CR02_PHONE_NUMBER          0 non-null float64
CR02_MOBILE_NUMBER         0 non-null float64
CG03_OUTLET_STATUS         455063 non-null int64
CR02_FAX_AREA_CODE         0 non-null float64
CR02_PHONE_AREA_CODE       0 non-null float64
CR02_AUDIT_TRAIL_USER      455063 non-null object
CR02_AUDIT_TRAIL_DATE      455063 non-null

## Search data in a DataFrame

In [39]:
# Case-insensitive regular-expression search over the outlet names.
import re

# Compile once so the IGNORECASE flag travels with the pattern.
regex = re.compile(r'КАТИНА.*ХРИСТОВА', re.IGNORECASE)

# na=False: a row whose name is missing counts as "no match", which keeps the
# mask strictly boolean. Without it a NaN in the column would propagate into
# the mask and the boolean indexing below would raise.
mask = nap1.CR02_OUTLET_NAME.str.contains(regex, na=False)

# Show the first five matching rows.
nap1[mask].head()

Unnamed: 0,CR01_TIN_ID,CR02_EMAIL,CR02_WEB_SITE,CR02_TELEX_NO,CR02_FAX_NUMBER,CR02_TRADE_NAME,CR02_OUTLET_NAME,CR02_DATE_OPENED,CR02_OUTLET_CODE,CG03_OUTLET_TYPE,CR02_DATE_CLOSED,CG04_GROUP_SECTOR,CR02_PHONE_NUMBER,CR02_MOBILE_NUMBER,CG03_OUTLET_STATUS,CR02_FAX_AREA_CODE,CR02_PHONE_AREA_CODE,CR02_AUDIT_TRAIL_USER,CR02_AUDIT_TRAIL_DATE,CR02_AUDIT_TRAIL_ACTION
0,324852794,,,,,,КАТИНА ХРИСТОВА ШАРКОВА,,0,21905,,,,,30603,,,AR GRAO,2012-09-13,Automatic registration
1739,324849340,,,,,,КАТИНА ДИМИТРОВА ХРИСТОВА,,0,21905,,,,,30601,,,DM GRAO,2006-12-26,Data migration
27322,324881024,,,,,,КАТИНА ХРИСТОВА ХРИСТОВА,,0,21905,,,,,30603,,,AR GRAO,2017-03-09,Automatic registration
71182,324924802,,,,,,КАТИНА ИВАНОВА ХРИСТОВА,,0,21905,,,,,30603,,,AR GRAO,2010-02-10,Automatic registration
87742,324937995,,,,,,КАТИНА АНАСТАСОВА ХРИСТОВА,,0,21905,,,,,30603,,,AR GRAO,2017-06-15,Automatic registration
