## Exploring a DataFrame

In [1]:
import numpy as np
import pandas as pd

# Read the CSV into a DataFrame. The path is relative, so it resolves against
# the kernel's current working directory. No index_col is passed, so pandas
# assigns a default integer row index.
df = pd.read_csv("Data/sales_data_with_stores.csv")

## Exercise 1 - display first n rows

In [2]:
# With no argument, head() returns the first 5 rows (its default).
df.head()

Unnamed: 0,store,product_group,product_code,stock_qty,cost,price,last_week_sales,last_month_sales
0,Violet,PG2,4187,498,420.76,569.91,13,58
1,Rose,PG2,4195,473,545.64,712.41,16,58
2,Violet,PG2,4204,968,640.42,854.91,22,88
3,Daisy,PG2,4219,241,869.69,1034.55,14,45
4,Daisy,PG2,4718,1401,12.54,26.59,50,285


In [3]:
# Pass a row count to preview only the first 3 rows.
df.head(3)

Unnamed: 0,store,product_group,product_code,stock_qty,cost,price,last_week_sales,last_month_sales
0,Violet,PG2,4187,498,420.76,569.91,13,58
1,Rose,PG2,4195,473,545.64,712.41,16,58
2,Violet,PG2,4204,968,640.42,854.91,22,88


## Exercise 2 - display last n rows

In [4]:
# tail() is the counterpart of head(): with no argument it returns the last 5 rows.
df.tail()

Unnamed: 0,store,product_group,product_code,stock_qty,cost,price,last_week_sales,last_month_sales
995,Daisy,PG4,8048,415,11.99,11.39,28,60
996,Daisy,PG4,8050,-10,1.32,3.32,14,11
997,Violet,PG2,952,5388,37.71,61.74,331,1041
998,Violet,PG2,1307,44996,31.44,42.74,1772,6394
999,Violet,PG5,3018,1697,4.68,18.99,19,52


In [5]:
# Pass a row count to show only the last 4 rows.
df.tail(4)

Unnamed: 0,store,product_group,product_code,stock_qty,cost,price,last_week_sales,last_month_sales
996,Daisy,PG4,8050,-10,1.32,3.32,14,11
997,Violet,PG2,952,5388,37.71,61.74,331,1041
998,Violet,PG2,1307,44996,31.44,42.74,1772,6394
999,Violet,PG5,3018,1697,4.68,18.99,19,52


## Exercise 3 - size

In [6]:
# shape is a (number of rows, number of columns) tuple.
df.shape

(1000, 8)

In [7]:
# size is the total number of cells, i.e. rows * columns.
df.size

8000

In [8]:
# len() of a DataFrame counts rows only.
len(df)

1000

## Exercise 4 - columns

In [9]:
# columns is a pandas Index holding the column labels.
df.columns

Index(['store', 'product_group', 'product_code', 'stock_qty', 'cost', 'price',
       'last_week_sales', 'last_month_sales'],
      dtype='object')

In [10]:
# Wrap the Index in list() to get a plain Python list of the label strings.
list(df.columns)

['store',
 'product_group',
 'product_code',
 'stock_qty',
 'cost',
 'price',
 'last_week_sales',
 'last_month_sales']

## Exercise 5 - index

In [11]:
# The row index. It is a default RangeIndex here because the CSV was loaded
# without an index_col.
df.index

RangeIndex(start=0, stop=1000, step=1)

## Exercise 6 - data types

In [12]:
# One dtype per column. In the output recorded here the text columns
# (store, product_group) are reported as object.
df.dtypes

store                object
product_group        object
product_code          int64
stock_qty             int64
cost                float64
price               float64
last_week_sales       int64
last_month_sales      int64
dtype: object

## Exercise 7 - describe function

In [13]:
# With no arguments, describe() summarizes only the numeric columns:
# count, mean, std, min, the 25%/50%/75% quantiles, and max.
df.describe()

Unnamed: 0,product_code,stock_qty,cost,price,last_week_sales,last_month_sales
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,5095.086,6012.505,45.95207,67.06351,76.918,282.0
std,2801.119752,129789.2,114.169658,144.104975,197.39394,741.959041
min,16.0,-123.0,0.54,0.66,12.0,11.0
25%,2647.75,476.0,6.585,13.29,18.75,61.0
50%,4969.5,1007.5,13.09,23.74,31.0,110.5
75%,7653.25,2105.25,33.5675,56.99,64.0,242.0
max,9972.0,4104542.0,1243.0,1500.05,3222.0,12353.0


## Exercise 8 - describe function

* Customize percentiles

In [14]:
# percentiles takes fractions in [0, 1] and replaces the default 25%/75% rows
# with the requested 10% and 90% rows. In the output recorded here the 50%
# (median) row is still present even though it was not requested.
df.describe(percentiles=[0.1, 0.9])

Unnamed: 0,product_code,stock_qty,cost,price,last_week_sales,last_month_sales
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,5095.086,6012.505,45.95207,67.06351,76.918,282.0
std,2801.119752,129789.2,114.169658,144.104975,197.39394,741.959041
min,16.0,-123.0,0.54,0.66,12.0,11.0
10%,1677.9,265.6,3.149,7.12,14.0,38.9
50%,4969.5,1007.5,13.09,23.74,31.0,110.5
90%,9148.5,3947.2,89.57,132.91,155.0,555.8
max,9972.0,4104542.0,1243.0,1500.05,3222.0,12353.0


## Exercise 9 - describe function

* For columns with object data type

In [15]:
# include=[object] restricts the summary to object-dtype columns, which are
# described with count/unique/top/freq instead of numeric statistics.
df.describe(include=[object])

Unnamed: 0,store,product_group
count,1000,1000
unique,3,6
top,Daisy,PG4
freq,470,349


## Exercise 10 - describe function

* Include only floats

In [16]:
# include=float keeps only the float columns (cost and price here);
# percentiles adds the 40% and 70% rows to the summary.
df.describe(percentiles=[.4, .7], include=float)

Unnamed: 0,cost,price
count,1000.0,1000.0
mean,45.95207,67.06351
std,114.169658,144.104975
min,0.54,0.66
40%,9.938,18.99
50%,13.09,23.74
70%,25.223,44.64
max,1243.0,1500.05


## Exercise 11 - memory usage

* The `memory_usage` method returns the memory usage of each column (and of the index) in bytes

In [17]:
# Returns one byte count per column plus an entry for the index.
# With the default deep=False, object columns are measured as 8-byte
# references (1000 rows -> 8000 bytes), not as the strings they point to.
# Pass deep=True to include the string contents in the figure.
df.memory_usage()

Index                128
store               8000
product_group       8000
product_code        8000
stock_qty           8000
cost                8000
price               8000
last_week_sales     8000
last_month_sales    8000
dtype: int64

In [18]:
# Sum the per-column figures (index included) into a single total in bytes.
df.memory_usage().sum()

64128

In [19]:
# NOTE(review): same call as in Exercise 5. Presumably repeated to show that
# the index is a RangeIndex, which would account for the small Index entry in
# the memory_usage() output — confirm the intent or drop the duplicate cell.
df.index

RangeIndex(start=0, stop=1000, step=1)

## Conclusion

In this notebook we explored a DataFrame in terms of:

* How it looks (i.e. the first and last n rows)
* Columns and row index
* Shape and size
* Data types
* Basic statistics
* Memory usage