In [None]:
#Initialize

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Read the toy grocery transactions; column 0 of the CSV becomes the row index.
groceries = pd.read_csv(
    filepath_or_buffer='resources/datasets/toy_groceries.csv',
    index_col=0,
)

# Preview the first rows of the frame.
display(groceries.head())

In [None]:
#generate groceries dataset
# NOTE(review): this whole cell is commented out. It appears to be the one-off script
# that produced resources/datasets/toy_groceries.csv (the path it writes to is the
# same one the first cell reads). No random seed is set, so re-running it would
# produce different amounts and dates than the checked-in file.

# Five base transactions: spend per category.
# groceries_df = pd.DataFrame({'produce ($)' : [30.0,50.0,65.0,15.0,20.0],
#                             'meat ($)' : [20.0,0.0,0.0,30.0,20.0],
#                             'dairy ($)' : [10.0,20.0,0.0,20.0,15.0],
#                             'other ($)' : [40.0,25.0,30.0,60.0,30.0]})


# import random

# 10 passes over the 5 base rows (50 rows total); each row is scaled by its own
# random factor in [1, 2).
# data = [groceries_df.iloc[j]*(1 + random.random()) for i in range(10) for j in range(5)]

# from datetime import date
# from datetime import timedelta

# base_date = date(2012,1,1)

# One random date per generated row, 1 to 365 days after base_date.
# dates = [base_date + timedelta(days=random.randint(1,365)) for d in data]

# tmp = pd.DataFrame(data)
    
# The frame's index at this point is reused as the loyalty card id
# (presumably the base-row label 0..4 carried over from groceries_df.iloc[j] — TODO confirm).
# tmp['loyalty-card-id'] = tmp.index
# tmp['date'] = dates

# Replace the index with a fresh 0..n-1 range and label it as the transaction id.
# t = tmp.reset_index(drop=True)

# t.index.name = 'Transaction Id'

# t.to_csv("resources/datasets/toy_groceries.csv")

# Numpy

## Creating matrices

In [None]:
import numpy.random as npr

# From nested Python lists: every inner list becomes one row of the array.
py_x = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
]

x = np.array(py_x)          # 3 rows x 4 columns
y = np.array([
    [0, 0, 0],
    [1, 2, 3],
    [5, 6, 7],
    [4, 8, 9],
])                          # 4 rows x 3 columns

# Evenly spaced integers, stop value excluded:
# array([20, 22, 24, 26, 28, 30, 32, 34, 36, 38])
np.arange(20, 40, 2)

# Constant-filled matrices; the argument is the shape.
np.ones((2, 2))     # 2x2, all ones
np.zeros((3, 2))    # 3x2, all zeros

# 2x2 identity matrix
np.eye(2)

# 3x3 matrix of random floats
npr.randn(3, 3)


## Operations and Functions

In [None]:
x = np.array([
    [1, 2],
    [3, 4],
])
y = np.array([
    [5, 6],
    [7, 8],
])

# --- array (op) scalar ---------------------------------------------------
# The scalar is broadcast, i.e. applied to every element of the array.

x ** 0.5    # each element raised to the power 0.5
1 / x       # reciprocal of each element
x + 2       # 2 added to each element

# --- array (op) array ----------------------------------------------------
# With two arrays of the same shape the operation pairs up elements by
# position (these are NOT matrix operations).

x - y       # difference, element by element
x + y       # sum, element by element
x * y       # product, element by element
x / y       # quotient, element by element

### Some Useful Universal functions (ufuncs)

#### Unary
abs, sqrt, square, exp, log, sign, ceil, floor, rint, cos, cosh, sin, ...

#### Binary
add, subtract, multiply, divide, power, maximum, minimum, mod, >, <, & (and) , | (or), ...

### Statistical Methods

sum, mean, std, var, min, max, argmin, argmax, cumsum, cumprod

### Set operations

unique, intersect1d, union1d, setdiff1d

### Linear Algebra

dot, trace, det (determinant), eig (eigenvalues and eigenvectors), inv (inverse), qr (QR decomposition), svd (singular value decomposition), solve (solve the linear system Ax = b), lstsq (least-squares solution to Ax=b).

## Saving arrays to file

Store a single array in a file
np.save('file-name', arr)
np.load('file-name.npy')

Store multiple arrays in a file
np.savez('file-name.npz', a=arr, b=arr, ...)      keyword args will label different arrays
my_saved_arrays = np.load('file-name.npz')

my_saved_arrays['b']   will load the second array that was stored in that file

## Vectorized Computation

In [None]:
import random as r

# --- Random walk with standard Python --------------------------------------
num_steps = 1000
position = 0
# Position after every step. Kept under its own name so it is not overwritten
# by the +/-1 `steps` vector built in the vectorized version below.
positions = []

for st in range(0, num_steps):
    # randint(0, 1) is inclusive on both ends: 1 -> step up, 0 -> step down
    position += 1 if r.randint(0, 1) else -1
    positions.append(position)

plt.plot(positions[:100])
plt.show()

# --- Random walk, vectorized -----------------------------------------------
# The key idea is that we construct a vector of the draws (0 or 1), map it to
# steps (+1 or -1) with np.where, and get the whole history with a cumulative sum.

draws = np.random.randint(0, 2, size=num_steps)  # vector of random 0s and 1s (upper bound exclusive)

steps = np.where(draws > 0, 1, -1)  # 1 where the draw was 1, otherwise -1

walk = steps.cumsum()   # position after each step

# Plot the vectorized walk as well so both versions can be compared.
# The two walks use independent random draws, so the curves will differ.
plt.plot(walk[:100])
plt.show()

# Pandas Intro

## Series

A Series is best conceived of as a fixed-length, ordered dict that is a mapping of index values to data values.

### Creating Series

In [None]:
# From a plain list: pandas generates the default integer index 0..3 for us.
obj1 = pd.Series([4, 5, -7, 0])

obj1.values     # the data
obj1.index      # the labels

# With explicit labels, passed through index= ...
IQs = pd.Series(
    data=[94, 105, -7, 131],
    index=['John', 'Jillian', 'Jesus', 'Tomas'],
)

# ... or from a dict, whose keys become the labels.
IQs = pd.Series({
    'John': 94,
    'Jillian': 105,
    'Jesus': -7,
    'Tomas': 131,
})

# Both the index and the values can carry a name.
IQs.index.name = 'First Name'
IQs.name = 'IQ Scores'

IQs

### Queries and Operations

In [None]:
IQs = pd.Series({
    'John': 94,
    'Jillian': 105,
    'Jesus': -7,
    'Tomas': 131,
})

# Look up one label -> a scalar; a list of labels -> a Series in the requested order.
IQs['Tomas']                # 131
IQs[['Tomas', 'John']]      # Series: Tomas -> 131, John -> 94

# Boolean-mask filtering: `IQs > 0` is a Series of True/False per label,
# and indexing with it keeps only the True entries.
IQs[IQs > 0]

# Element-wise functions and arithmetic operate on the values and keep the index.
np.exp(IQs)
IQs * 2

# Binary operations between two Series line the values up by index label first.
yr1_revenue = pd.Series({'John': 94, 'Jillian': 105})
yr2_revenue = pd.Series({'John': 100, 'Jillian': 125})

yr1_revenue + yr2_revenue

## DataFrame

A DataFrame can be conceived of as an ordered dict that is a mapping of column labels to Series objects, all of which share the same index.  It can also be thought of as an indexed 2d table of data.

### Creating DataFrames

In [None]:
from sklearn import datasets

#creating data frames

# The rows are deliberately of unequal length: pandas pads the shorter rows, so
# Idaho and Nevada have missing values (NaN) for 2014 and 2015.
census_pops = [[10000, 10200, 10500, 11000],
               [10100,0 ,10500, 10700],
               [0, 0,20500, 21700, 0, 23000]]

cdata = pd.DataFrame(census_pops,columns=["2010","2011","2012","2013","2014","2015"], index=['Idaho',"Nevada",'Utah'])
display(cdata.head())

#load some toy datasets for future cells
iris_ds = datasets.load_iris()
iris_df = pd.DataFrame(iris_ds.data,columns=iris_ds.feature_names)

# `load_boston` was removed in scikit-learn 1.2 and now raises ImportError.
# Fall back to the original data source (the replacement recipe given in the
# scikit-learn removal notice) so that `house_ds` / `house_df` keep the same
# shape and column names (e.g. 'PTRATIO') that later cells rely on.
# The fallback downloads the file, so it needs network access.
try:
    house_ds = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    boston_raw = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                             sep=r"\s+", skiprows=22, header=None)
    # Each record spans two physical lines in the file: 11 values on the first,
    # then B, LSTAT and the target (MEDV) on the second.
    house_ds = Bunch(
        data=np.hstack([boston_raw.values[::2, :], boston_raw.values[1::2, :2]]),
        target=boston_raw.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    )
house_df = pd.DataFrame(house_ds.data,columns=house_ds.feature_names)

display(iris_df.head())
house_df.head()

### Essential Operations

#### Indexing and filtering

| Type | Notes |
|-|-|
|df[val] | Select single column or sequence of columns by label; special cases: a boolean array (bit mask) filters rows, a slice selects rows, and a boolean DataFrame selects individual values |
|df.loc[val] | select row or subset of rows by _label_|
|df.loc[val1,val2] | select both rows and columns by _label_|
| df.iloc[idx] | select row or subset of rows based on integer position |

In [None]:
# Transposed view of the frame (rows <-> columns)
iris_df.T

# Row labels and column labels
iris_df.index
iris_df.columns

# The data as a plain NumPy array
iris_df.values

# Plain [] looks things up by column *label*, so asking for integer position 1
# fails on this frame -- use iloc for position-based access instead.
# error!!!!
# iris_df[1]

# Second row (positions start at 0)
iris_df.iloc[1]

# First 4 rows of the frame
iris_df.iloc[:4]

# All rows, first 4 columns
iris_df.iloc[:, :4]

# Boolean masks: build a True/False Series, then index with it to keep the True rows
mask = cdata['2012'].lt(12000)
display(mask)
cdata.loc[mask]

#### Misc operations

In [None]:
# Arithmetic and sorting on a DataFrame

# A scalar is broadcast: 200 is added to every element
cdata + 200

# Same idea via the method form; fill_value=0 substitutes 0 for missing
# values (NaN/None) before 5000 is added
cdata.add(5000, fill_value=0)

# Add the first row to every row (the row's labels are matched against the columns)
cdata.add(cdata.iloc[0], axis='columns')

# Add the '2010' column to every column (its labels are matched against the index)
cdata.add(cdata['2010'], axis='index')

# Sort rows by the 2013 population, largest first
cdata.sort_values('2013', ascending=False)

# Sort by 2012 population, breaking ties with 2013 population (both descending)
cdata.sort_values(['2012', '2013'], ascending=False)

|Function|Description|
|-|-|
|count|Number of non-NA values|
|describe|Compute set of summary statistics for Series or each DataFrame column|
|min, max|Compute minimum and maximum values|
|argmin, argmax|Compute index locations (integers) at which minimum or maximum value obtained, respectively|
|idxmin, idxmax|Compute index labels at which minimum or maximum value obtained, respectively|
|quantile|Compute sample quantile ranging from 0 to 1|
|sum|Sum of values|
|mean|Mean of values|
|median|Arithmetic median (50% quantile) of values|
|mad|Mean absolute deviation from mean value (removed in pandas 2.0; compute it as `(s - s.mean()).abs().mean()` instead)|
|prod|Product of all values|
|var|Sample variance of values|
|std|Sample standard deviation of values|
|skew|Sample skewness (third moment) of values|
|kurt|Sample kurtosis (fourth moment) of values|
|cumsum|Cumulative sum of values|
|cummin, cummax|Cumulative minimum or maximum of values, respectively|
|cumprod|Cumulative product of values|
|diff|Compute first arithmetic difference (useful for time series)|
|pct_change|Compute percent changes|

### Descriptive Statistics

In [None]:
# Mean of every column (the default: aggregate down the rows)
iris_df.mean(axis='index')

# Mean of every row (aggregate across the columns)
iris_df.mean(axis=1)

# Summary statistics (count, mean, std, quartiles, ...) per column
display(iris_df.describe())

# Correlation between every pair of columns
iris_df.corr()

# Covariance between every pair of columns
iris_df.cov()

# Correlation of each column with one given Series (here: the first column)
iris_df.corrwith(iris_df.iloc[:, 0])

# Distinct values that occur in a Series
house_df['PTRATIO'].unique()

# How often each value occurs; head() shows only the first few counts
house_df['PTRATIO'].value_counts().head()

# Loading and Storing data

## Files

### File input

In [None]:
# The CSV has no header row, so the column labels are supplied explicitly
# (passing header=None instead would auto-generate them).
col_names = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
    'class',
]
iris = pd.read_csv(filepath_or_buffer="resources/iris.csv", names=col_names)

iris.head()

Read file functions have lots of optional parameters to help deal with awkward input data.  Below is a sample of such parameters:

| Param | Use case | example |
|-|-|-|
| names | column names | names=['a','b','c'] |
| index_col | which column should be used as the DataFrame's index | index_col='student_id' |
| sep | the field delimiter using a regex | sep='\s+' |
| skiprows | position of rows to skip | skiprows=[0,1,2] |
| na_values| a vector of ~= null values or a map for column specific null values | na_values=['99999999'] |
| nrows | read the first n rows | nrows=8 |
| chunksize | the read function will return an iterator over the file, batch size of chunksize | chunksize=1000|

### File output

In [None]:
# Write two columns of the iris DataFrame to the 'examples/out.psv' file, using a |
# separator and replacing null values with the string NULL.
# `columns` must name columns that actually exist in the frame; the previous
# placeholder names ['a', 'b'] are not columns of `iris` and raised a KeyError.
iris.to_csv('examples/out.psv', sep='|', na_rep='NULL',
            columns=['sepal length (cm)', 'sepal width (cm)'])

### Misc formats

In [None]:
#JSON
my_json = pd.read_json('examples/example.json') # not a working example
my_json.to_json()  #convert the DataFrame (or Series) to JSON

#HTML
pd.read_html("blah.html") #not a working example

#xml - use one of the many python libraries for reading xml

# Pickle (Python's binary serialization)
# Unpickling can execute arbitrary code: only read pickle files you created
# yourself or otherwise trust (this one was written on the line above).
iris.to_pickle('examples/pickled_iris')
x = pd.read_pickle('examples/pickled_iris')


#HDF5 Format - file format for storing large quantities of scientific array data
#The HDFStore class works like a dict

# Open the store in a `with` block so the underlying file handle is closed
# (and pending writes flushed) even if one of the statements raises.
with pd.HDFStore('mydata.h5') as store:
    store['obj1'] = iris
    store['obj1_col'] = iris['class']

    #retrieve the data
    x = store['obj1']


#excel files
pd.ExcelFile('examples/ex1.xlsx')

## Web APIs

In [None]:
#requests package is the go to for simplified http interaction

import requests

url = 'https://api.github.com/repos/pandas-dev/pandas/issues'

# Always pass a timeout: without one, requests waits indefinitely if the
# server stops responding and the cell never finishes.
r = requests.get(url, timeout=10)

# Fail loudly on HTTP errors (4xx/5xx) instead of silently parsing an error body.
r.raise_for_status()

# r.status_code
# r.headers['content-type']
# r.encoding
r.json()  #get the contents of the response, assuming json content-type




## Databases

### SQL

In [None]:
# Setting the system up for pyodbc takes a fair amount of work outside of this notebook, see
# https://tryolabs.com/blog/2012/06/25/connecting-sql-server-database-python-under-ubuntu/

import pyodbc

# ODBC connection string: semicolon-separated KEY=value pairs (placeholder values)
__sql_connect_phrase = 'DSN=your_dsn;DATABASE=your_db;UID=your_user;PWD=your_password'

sql_conn = pyodbc.connect(__sql_connect_phrase)

# Run the query and load the result set into a DataFrame
all_data = pd.read_sql_query(
    sql="SELECT * from ar.db.AccountsReceivable WHERE active='y'",
    con=sql_conn,
)

### Mongo

In [None]:
from pymongo import MongoClient

# MongoClient() with no arguments looks for a mongod instance on localhost:27017.
# To reach a different instance, pass its mongoDB URI to MongoClient instead:
#   format  : 'mongodb://[username:password@]host1[:port1][,...hostN[:portN]]][/[database][?options]]'
#   example : mongodb://myDBReader:P40ssw0rd@mongodb0.example.com:27017/admin
client = MongoClient()

# Select the database named 'test'
db = client['test']

# Fetch every document of the 'temp' collection into a list
docs = list(db['temp'].find())

# Data Preparation

## Missing and Duplicate Data

In [None]:
from numpy import nan as NA

s = pd.Series(['aardvark', 'artichoke', NA, 'avocado'])

#boolean mask for nil values (can be used to filter data)
s.isnull()

#boolean mask for NOT nil values
s.notnull()

df = pd.DataFrame([[1., 6.0, 3.5], [1.5, NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])

display(df)

#drop rows
df.dropna() #drop any row that contains a null value
df.dropna(how='all') #drop rows that are all null
df.dropna(thresh=2) #drop all rows that do not contain at least 2 non null values

#replace nulls
df.fillna(0)  #replace null values with 0
# Column-specific replacement: the dict keys must be existing column labels.
# `df` was built without column names, so its labels are the integers 0, 1, 2;
# keys that match no column (such as 'col1') are silently ignored.
df.fillna({0 : 0.0, 1 : 0.1})
df.fillna(df.mean())    #replace null values with the column mean


#duplication
df.duplicated() #a (series) boolean mask for duplicate rows
df.drop_duplicates()  #drop duplicate rows

# Small frame with key columns k1 and k2 for the subset example below;
# its last two rows share the same (k1, k2) pair.
data = pd.DataFrame({'k1' : ['one', 'two'] * 3 + ['two'],
                     'k2' : [1, 1, 2, 3, 3, 4, 4]})

#filter duplicate by k1 and k2 columns, keep last instance of duplicated record
data.drop_duplicates(['k1', 'k2'], keep='last')

## Transformations

### map, replace, rename

In [None]:
display(groceries.head())

# Series.map calls a function on every element of a Series;
# here each amount is formatted as a fixed-width string with 2 decimals.
groceries['meat ($)'].map("{:6.2f}".format)

# DataFrame.applymap does the same for every element of a DataFrame.
# NOTE(review): recent pandas releases (2.1+) deprecate applymap in favour of DataFrame.map -- confirm against the installed version.
groceries[['produce ($)', 'meat ($)']].applymap("{:6.2f}".format).head()

# Replacing values: one value, a list of values, or a dict of old -> new
groceries.replace(999999, np.nan)
groceries.replace(['999999', '-----'], np.nan)
groceries.replace({'99999': np.nan, '----': ''})

# Transform the index labels with a function
groceries.index.map(lambda label: label + 1)

# Rename selected index labels and/or column labels via dicts of old -> new
groceries.rename(
    index={0: 2},
    columns={'produce ($)': 'produce'},
).head()


# Sources

McKinney, Wes. Python for Data Analysis : Data Wrangling with Pandas, NumPy, and IPython, O'Reilly Media, Incorporated, 2017. ProQuest Ebook Central, http://ebookcentral.proquest.com/lib/boisestate/detail.action?docID=5061179.