In [9]:
# Exercise: data wrangling (45 min)
# Follow this tutorial: data wrangling


#Step 1: Set up your environment.

In [10]:
#Step 2: Import libraries and dataset.


# Pandas for managing datasets
import pandas as pd

In [11]:
# Notebook-wide pandas display settings.
# Floats: thousands separator and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Raise the truncation limits for printed frames.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [12]:
# Column labels to apply to the BNC2 sample file when loading it
bnc2_columns = ['Code', 'Date', 'Open', 'High', 'Low',
                'Close', 'Volume', 'VWAP', 'TWAP']

# Load the BNC2 sample dataset with those labels
df = pd.read_csv('BNC2_sample.csv', names=bnc2_columns)

# Preview the first rows (head() defaults to 5)
df.head()

Unnamed: 0,Code,Date,Open,High,Low,Close,Volume,VWAP,TWAP
0,GWA_BTC,2014-04-01,467.28,488.62,467.28,479.56,74776.48,482.76,482.82
1,GWA_BTC,2014-04-02,479.2,494.3,431.32,437.08,114052.96,460.19,465.93
2,GWA_BTC,2014-04-03,437.33,449.74,414.41,445.6,91415.08,432.29,433.28
3,GWA_BTC,2014-04-04,445.18,456.1,429.16,449.81,51147.27,443.46,443.93
4,GWA_BTC,2014-04-05,450.08,464.09,445.16,461.7,28449.19,452.53,452.95


In [13]:
#Step 3: Understand the data.
    #Equivalence in Granularity
    #Equivalence in Units
    #First, see how some codes start with GWA and others with MWA? These are actually completely different types of 
    #indicators according to the documentation page.


# List every distinct value found in the Code column
print(df['Code'].unique())

# Output recorded for this dataset:
# ['GWA_BTC' 'GWA_ETH' 'GWA_LTC' 'GWA_XLM' 'GWA_XRP' 'MWA_BTC_CNY'
#  'MWA_BTC_EUR' 'MWA_BTC_GBP' 'MWA_BTC_JPY' 'MWA_BTC_USD' 'MWA_ETH_CNY'
#  'MWA_ETH_EUR' 'MWA_ETH_GBP' 'MWA_ETH_JPY' 'MWA_ETH_USD' 'MWA_LTC_CNY'
#  'MWA_LTC_EUR' 'MWA_LTC_GBP' 'MWA_LTC_JPY' 'MWA_LTC_USD' 'MWA_XLM_CNY'
#  'MWA_XLM_EUR' 'MWA_XLM_USD' 'MWA_XRP_CNY' 'MWA_XRP_EUR' 'MWA_XRP_GBP'
#  'MWA_XRP_JPY' 'MWA_XRP_USD']

['GWA_BTC' 'GWA_ETH' 'GWA_LTC' 'GWA_XLM' 'GWA_XRP' 'MWA_BTC_CNY'
 'MWA_BTC_EUR' 'MWA_BTC_GBP' 'MWA_BTC_JPY' 'MWA_BTC_USD' 'MWA_ETH_CNY'
 'MWA_ETH_EUR' 'MWA_ETH_GBP' 'MWA_ETH_JPY' 'MWA_ETH_USD' 'MWA_LTC_CNY'
 'MWA_LTC_EUR' 'MWA_LTC_GBP' 'MWA_LTC_JPY' 'MWA_LTC_USD' 'MWA_XLM_CNY'
 'MWA_XLM_EUR' 'MWA_XLM_USD' 'MWA_XRP_CNY' 'MWA_XRP_EUR' 'MWA_XRP_GBP'
 'MWA_XRP_JPY' 'MWA_XRP_USD']


In [14]:
# Side-by-side rows for one GWA code and two MWA codes of the same coin
# on a single date, to illustrate how the two kinds of code relate
df.loc[
    df['Code'].isin(['GWA_BTC', 'MWA_BTC_JPY', 'MWA_BTC_EUR'])
    & df['Date'].eq('2018-01-01')
]

Unnamed: 0,Code,Date,Open,High,Low,Close,Volume,VWAP,TWAP
1371,GWA_BTC,2018-01-01,14505.89,14505.89,13617.46,14092.74,225906.21,14103.18,14093.73
9074,MWA_BTC_EUR,2018-01-01,11859.35,11859.35,11111.07,11403.92,14933.73,11488.45,11478.08
11838,MWA_BTC_JPY,2018-01-01,1674341.45,1678567.55,1572173.9,1632657.51,68611.95,1632994.4,1631407.66


In [15]:
#Step 4: Filter unwanted observations.
    #In the previous step, we learned that GWA codes are aggregations of the regional MWA codes. Therefore, 
    #to perform our analysis, we only need to keep the global GWA codes:

    
# Number of observations in the dataset before filtering
print('Before:', len(df))
# Before: 31761

# Collect the GWA codes. Match on the prefix only: a substring test
# ('GWA_' in code) would also accept any code that merely contains 'GWA_'
# somewhere in the middle of its name.
gwa_codes = [code for code in df.Code.unique() if code.startswith('GWA_')]

# Keep only the rows whose Code is one of the GWA codes
df = df[df.Code.isin(gwa_codes)]

# Number of observations left after filtering
print('After:', len(df))
# After: 6309

Before: 31761
After: 6309


In [17]:
#Step 5: Pivot the dataset.
    #For this tutorial, let's keep the VWAP (volume weighted average price) column
    #As you can see, each column in our pivoted dataset now represents the price for one cryptocurrency 
    #and each row contains prices from one date. All the features are now aligned by date.


# Reshape from long to wide: one row per Date, one column per Code,
# with the VWAP value in each cell
pivoted_df = df.pivot(
    index='Date',
    columns='Code',
    values='VWAP',
)

# Show the last rows of the pivoted frame
pivoted_df.tail()

Code,GWA_BTC,GWA_ETH,GWA_LTC,GWA_XLM,GWA_XRP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-19,11826.36,1068.45,195.0,0.51,1.82
2018-01-20,13062.68,1158.71,207.58,0.52,1.75
2018-01-21,12326.23,1108.9,197.36,0.48,1.55
2018-01-22,11397.52,1038.21,184.92,0.47,1.43
2018-01-23,10921.0,992.05,176.95,0.47,1.42
