In [4]:
import numpy as np
import pandas as pd
import warnings
import random
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Location of the employee dataset (relative to the notebook's working directory).
path = 'datasets/employee_list.parquet'
# Load the parquet file into the DataFrame used by the following cells.
df = pd.read_parquet(path)

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


### A few different methods within the `dt` accessor.

In [4]:
# Extract individual calendar components from the `dob` column through the
# `.dt` accessor. Only the first three values are printed to keep output short.

# day of month
print("Date in month: ", df.dob.dt.day.tolist()[:3])

# hour of day
print("Hour of day: ", df.dob.dt.hour.tolist()[:3])

# ISO week number (`.dt.weekofyear` was deprecated in pandas 1.1 and removed
# in 2.0; `.dt.isocalendar().week` is the supported replacement)
print("Week number: ", df.dob.dt.isocalendar().week.tolist()[:3])

# day of year
print("Day number of year: ", df.dob.dt.dayofyear.tolist()[:3])

# day of week (as a name)
print("Day of week: ", df.dob.dt.day_name().tolist()[:3])

# year (label fixed: it previously repeated "Day of week")
print("Year: ", df.dob.dt.year.tolist()[:3])

Date in month:  [3, 9, 29]
Hour of day:  [0, 0, 0]
Week number:  [26, 32, 31]
Day number of year:  [184, 222, 210]
Day of week:  ['Sunday', 'Saturday', 'Wednesday']
Day of week:  [1983, 1980, 1987]


### FILTER DATAFRAME BASED ON TIMESTAMP

##### Find employees born on or after 1990-01-01

In [123]:
# Cutoff timestamp, built from a slash-separated date string.
ts = pd.to_datetime('1/1/1990')

# Keep the rows whose dob is on or after the cutoff and show the first few.
df[df['dob'].ge(ts)].head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
10,361949,Laura Lane,Wilson and Sons,India,1990-07-13,31,Developer,89915,True
80,703509,Lisa Phillips,Spears-Brown,Japan,1990-09-05,31,Management,103309,False
97,628844,Elizabeth Thomas,Spears-Brown,Cayman Islands,1990-06-30,31,System Architect,77442,True


In [124]:
# Same cutoff as before, this time written as an ISO-8601 date string.
ts = pd.to_datetime('1990-01-01')

# `@ts` refers to the Python variable inside the query expression.
df.query('dob >= @ts').head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
10,361949,Laura Lane,Wilson and Sons,India,1990-07-13,31,Developer,89915,True
80,703509,Lisa Phillips,Spears-Brown,Japan,1990-09-05,31,Management,103309,False
97,628844,Elizabeth Thomas,Spears-Brown,Cayman Islands,1990-06-30,31,System Architect,77442,True


##### Find employees born 1990 or later or before 1970

In [125]:
# Boolean masks for the two birth-date ranges. Explicit timestamps make the
# boundaries visible instead of relying on pandas expanding bare year strings.
f1 = df.dob >= pd.Timestamp('1990-01-01')  # born in 1990 or later
f2 = df.dob < pd.Timestamp('1970-01-01')   # strictly before 1970 (`<=` also admitted 1970-01-01)

# Rows matching either range.
df.loc[f1 | f2].head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
6,646135,Jonathan Brown,Wilson and Sons,USA,1969-06-05,52,Developer,77451,False
10,361949,Laura Lane,Wilson and Sons,India,1990-07-13,31,Developer,89915,True
13,869838,Benjamin Snyder,Spears-Brown,Japan,1968-04-10,53,System Architect,133943,True
21,207726,Bruce Crawford,Spears-Brown,Venezuela,1969-01-04,53,Consulting,78198,False


### Youngest employee

In [127]:
# The youngest employee has the latest date of birth; ties are all returned.
df[df['dob'].eq(df['dob'].max())]

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
80,703509,Lisa Phillips,Spears-Brown,Japan,1990-09-05,31,Management,103309,False


### Oldest employee

In [128]:
# The oldest employee has the earliest date of birth; ties are all returned.
df[df['dob'].eq(df['dob'].min())]

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
34,267914,Julie Craig,Wilson and Sons,Venezuela,1968-02-08,53,System Architect,131134,True


### Time between birth of oldest and youngest employee

In [129]:
# Latest and earliest birth dates in the dataset.
last = df.dob.max()
first = df.dob.min()

# Subtracting two timestamps yields a Timedelta.
age_gap = last - first

print('Timedelta: ', age_gap)
print('Days: ', age_gap.days)
# 365.25 days per year accounts for leap years (a flat 365 overstates the span).
print('Years: ', round(age_gap.days / 365.25, ndigits=1))

Timedelta:  8245 days 00:00:00
Days:  8245
Years:  22.6


### Filtering with the datetime objects as the index:

In [134]:
# Move `dob` into the index and sort chronologically; `df` itself is not modified.
df_dt = df.set_index('dob').sort_index()
# Preview the earliest birth dates.
df_dt.head()

Unnamed: 0_level_0,employee_number,name,company,country,age,department,salary,has_parking_space
dob,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1968-02-08,267914,Julie Craig,Wilson and Sons,Venezuela,53,System Architect,131134,True
1968-04-10,869838,Benjamin Snyder,Spears-Brown,Japan,53,System Architect,133943,True
1968-11-06,172167,Linda Page,Spears-Brown,Israel,53,Finance,98920,False
1968-12-15,100008,Susan Horn,"Hernandez, Cunningham and Clark",Suriname,53,System Architect,86788,True
1969-01-04,207726,Bruce Crawford,Spears-Brown,Venezuela,53,Consulting,78198,False


Employees born in 1969

In [136]:
# Partial-string indexing: '1969' selects every row whose index falls in that year.
# Use `.loc` — selecting rows with `frame['1969']` was deprecated in pandas 1.2
# and removed in 2.0, where `[]` with a single string means column lookup.
df_dt.loc['1969']

Unnamed: 0_level_0,employee_number,name,company,country,age,department,salary,has_parking_space
dob,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1969-01-04,207726,Bruce Crawford,Spears-Brown,Venezuela,53,Consulting,78198,False
1969-01-22,791480,Andrew Rivera,"Hernandez, Cunningham and Clark",Israel,53,Finance,86820,True
1969-03-04,267447,Seth Smith,Spears-Brown,Germany,52,System Architect,115653,False
1969-03-27,674256,Charles Williams,Spears-Brown,India,52,System Architect,85485,False
1969-06-05,646135,Jonathan Brown,Wilson and Sons,USA,52,Developer,77451,False
1969-06-11,589974,Martha Martin,"Hernandez, Cunningham and Clark",Cayman Islands,52,Finance,98668,False


Employees born June 1969

In [137]:
# Partial-string indexing at month resolution: all rows dated June 1969.
# `.loc` is required; `frame['1969-06']` row selection was removed in pandas 2.0.
df_dt.loc['1969-06']

Unnamed: 0_level_0,employee_number,name,company,country,age,department,salary,has_parking_space
dob,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1969-06-05,646135,Jonathan Brown,Wilson and Sons,USA,52,Developer,77451,False
1969-06-11,589974,Martha Martin,"Hernandez, Cunningham and Clark",Cayman Islands,52,Finance,98668,False


Employees born in the period June 1969 to August 1971

In [147]:
# Label-based slice on the datetime index; both end points are inclusive,
# so every row from June 1969 through the end of August 1971 is returned.
df_dt.loc['1969-06':'1971-08']

Unnamed: 0_level_0,employee_number,name,company,country,age,department,salary,has_parking_space
dob,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1969-06-05,646135,Jonathan Brown,Wilson and Sons,USA,52,Developer,77451,False
1969-06-11,589974,Martha Martin,"Hernandez, Cunningham and Clark",Cayman Islands,52,Finance,98668,False
1970-04-19,254128,Larry Robinson,Wilson and Sons,India,51,Management,89521,True
1970-05-09,187411,Sophia Wallace,Spears-Brown,Suriname,51,Management,129093,True
1971-04-04,311479,Steven Andrews,Spears-Brown,India,50,Finance,77945,False
1971-08-21,471885,Kimberly Williams,"Hernandez, Cunningham and Clark",Cayman Islands,50,Consulting,118495,True
1971-08-24,701315,Michael Fischer,Wilson and Sons,India,50,System Architect,106773,False


### ADD 5 DAYS TO EVERY DATE IN SERIES

In [132]:
# Take the first three birth dates as a small Series to experiment with.
dates = df['dob'].iloc[:3]
dates

0   1983-07-03
1   1980-08-09
2   1987-07-29
Name: dob, dtype: datetime64[ns]

In [133]:
# Shift every date forward by five calendar days (the unit is spelled out explicitly).
dates + pd.DateOffset(days=5)

0   1983-07-08
1   1980-08-14
2   1987-08-03
Name: dob, dtype: datetime64[ns]

### How to convert a series of date-strings to a timeseries

### How to get the day of month, week number, day of year and day of week from a series of date strings?
Get the day of month, week number, day of year and day of week from ser

In [41]:
# Input: six dates, each written in a different string format
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution
from dateutil.parser import parse

# dateutil guesses the format of each string individually; pd.to_datetime then
# guarantees a datetime64 Series so that the `.dt` accessor is available.
ser_ts = pd.to_datetime(ser.map(parse))

# day of month
print("Date: ", ser_ts.dt.day.tolist())

# ISO week number (`.dt.weekofyear` was removed in pandas 2.0)
print("Week number: ", ser_ts.dt.isocalendar().week.tolist())

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week (`.dt.weekday_name` was removed in pandas 1.0; use `day_name()`)
print("Day of week: ", ser_ts.dt.day_name().tolist())

Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


# Parse different date/datetime formats
- https://strftime.org/
- https://coincodex.com/crypto/ethereum/historical-data/

Both Python and pandas are able to recognize different date/datetime formats and parse them automatically into datetime objects.

In [3]:
# Input: six dates, each written in a different string format
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution 1: let dateutil guess the format of every string individually
from dateutil.parser import parse
ser_dateutil = ser.map(parse)
print(ser_dateutil, end='\n\n')

# Solution 2: let pandas do the parsing.
# Since pandas 2.0 the format is inferred from the first element and applied to
# all others, so a Series mixing several formats needs format='mixed' to have
# each element inferred on its own (requires pandas >= 2.0).
ser_pandas = pd.to_datetime(ser, format='mixed')
ser_pandas

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]



0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

If we need to parse a string that pandas will not recognize by itself we need to specify the format our strings are in.

In [17]:
# Strings in a layout pandas cannot infer on its own: date, then 12-hour clock hour and AM/PM marker.
stamps = ['2020-03-13 04-PM', '2020-03-13 05-PM', '2020-03-13 06-PM']
ser = pd.Series(stamps)

# %I = hour on a 12-hour clock, %p = AM/PM; together they yield 24-hour timestamps.
ser_dt = pd.to_datetime(ser, format='%Y-%m-%d %I-%p')
ser_dt

0   2020-03-13 16:00:00
1   2020-03-13 17:00:00
2   2020-03-13 18:00:00
dtype: datetime64[ns]

In this case pandas would be able to read the date format in the csv file (`Jan-27-2022`) but I included it just to show how to parse dates when reading a csv file.

In [183]:
# Show floats with thousands separators and one decimal in all following outputs.
pd.options.display.float_format = '{:,.1f}'.format

# Parse the 'Date' column while reading, using an explicit strftime pattern
# (e.g. 'Jan-27-2022'). `date_format` replaces the former
# `date_parser=lambda x: pd.datetime.strptime(...)` approach: `pd.datetime` was
# removed in pandas 2.0 and `date_parser` is deprecated (requires pandas >= 2.0).
# NOTE: from here on `df` holds the Ethereum prices, not the employee table.
df = pd.read_csv(
    'datasets/ethereum_2021_price_data.csv',
    parse_dates=['Date'],
    date_format='%b-%d-%Y',
)


#### Group the values by month and show the following:
- High -> max
- Low -> min
- Volume -> sum
- Open, Close, Market Cap -> mean

In [189]:
# Aggregation to apply to each column within a month.
mapper = {
    'Open': 'mean',
    'High': 'max',
    'Low': 'min',
    'Close': 'mean',
    'Volume': 'sum',
    'Market Cap': 'mean',
}

# Group rows into calendar months based on the 'Date' column. The redundant
# `axis=0` argument was dropped: it is the default and the keyword is deprecated.
# NOTE(review): pandas >= 2.2 spells the month-end alias 'ME'; 'M' is kept for
# compatibility with older versions — confirm the target pandas version.
grouper = pd.Grouper(key='Date', freq='M')
df.groupby(grouper).agg(mapper)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,743.1,754.3,722.4,730.9,23706827592.0,84369986354.0
2021-01-31,1202.9,1463.1,718.1,1224.8,1879221973503.0,138079919636.6
2021-02-28,1699.6,2024.8,1304.7,1705.1,1656843616320.0,195000571428.6
2021-03-31,1734.9,1982.1,1453.3,1748.6,1241825252395.0,200060032258.1
2021-04-30,2303.7,2955.6,1953.4,2334.4,1751819517372.0,266638400000.0
2021-05-31,3143.9,4362.9,1752.6,3137.6,2364815535928.0,364069548387.1
2021-06-30,2337.3,2890.5,1725.2,2319.3,1256082384028.0,270186000000.0
2021-07-31,2135.0,2694.0,1728.2,2149.5,1040299866231.0,249889322580.6
2021-08-31,3104.0,3797.5,2463.8,3144.9,1131183119795.0,365068322580.6
2021-09-30,3341.9,4015.3,2718.2,3324.0,930652332320.0,392587566666.7


An alternative way of doing the same thing

In [190]:
# Same monthly summary through `resample`, pointing it at the 'Date' column.
# Reuses the `mapper` aggregation spec defined for the groupby version above
# instead of repeating the identical dict.
df.resample('M', on='Date', closed='right').agg(mapper)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,743.1,754.3,722.4,730.9,23706827592.0,84369986354.0
2021-01-31,1202.9,1463.1,718.1,1224.8,1879221973503.0,138079919636.6
2021-02-28,1699.6,2024.8,1304.7,1705.1,1656843616320.0,195000571428.6
2021-03-31,1734.9,1982.1,1453.3,1748.6,1241825252395.0,200060032258.1
2021-04-30,2303.7,2955.6,1953.4,2334.4,1751819517372.0,266638400000.0
2021-05-31,3143.9,4362.9,1752.6,3137.6,2364815535928.0,364069548387.1
2021-06-30,2337.3,2890.5,1725.2,2319.3,1256082384028.0,270186000000.0
2021-07-31,2135.0,2694.0,1728.2,2149.5,1040299866231.0,249889322580.6
2021-08-31,3104.0,3797.5,2463.8,3144.9,1131183119795.0,365068322580.6
2021-09-30,3341.9,4015.3,2718.2,3324.0,930652332320.0,392587566666.7


We can also use the `resample` method on a dataframe with a datetime index.

In [188]:
# `resample` works directly on a datetime index, so no `on=` argument is needed.
# Reuses the `mapper` aggregation spec defined for the groupby version above
# instead of repeating the identical dict.
df.set_index('Date').resample('M', closed='right').agg(mapper)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,743.1,754.3,722.4,730.9,23706827592.0,84369986354.0
2021-01-31,1202.9,1463.1,718.1,1224.8,60620063661.4,138079919636.6
2021-02-28,1699.6,2024.8,1304.7,1705.1,59172986297.1,195000571428.6
2021-03-31,1734.9,1982.1,1453.3,1748.6,40058879109.5,200060032258.1
2021-04-30,2303.7,2955.6,1953.4,2334.4,58393983912.4,266638400000.0
2021-05-31,3143.9,4362.9,1752.6,3137.6,76284372126.7,364069548387.1
2021-06-30,2337.3,2890.5,1725.2,2319.3,41869412800.9,270186000000.0
2021-07-31,2135.0,2694.0,1728.2,2149.5,33558060201.0,249889322580.6
2021-08-31,3104.0,3797.5,2463.8,3144.9,36489778057.9,365068322580.6
2021-09-30,3341.9,4015.3,2718.2,3324.0,31021744410.7,392587566666.7


Set the date column as the index

In [None]:
# Build a date-indexed, chronologically sorted frame for date-based slicing and
# store it as `df_dt`. `df` is left untouched: an in-place set_index cannot be
# re-run (the 'Date' column is gone the second time) and would break any cell
# that still selects the 'Date' column from `df`.
df_dt = df.set_index('Date').sort_index()

Find the average closing price from March and April 2021.

In [154]:
# Derive the date-indexed price frame from `df` here instead of relying on a
# `df_dt` left over from an earlier cell.
# reset_index() makes this work whether 'Date' is currently a column or the
# index; sort_index() is needed for partial-string slicing on the dates.
(
    df.reset_index()
    .set_index('Date')
    .sort_index()
    .loc['2021-03':'2021-04', 'Close']
    .mean()
)

2133.90001

#### Find the following for closing price:
- change in closing price from one date to the next
- percentage change in closing price from one date to the next
- 2 period moving average (rolling mean)

In [50]:

# Work on a random 10-row sample of closing prices, ordered by date.
# - random_state pins the sample so the table is reproducible on re-run.
# - reset_index() lets the column selection work whether 'Date' is currently
#   a column or the index of `df`.
df_sample = (
    df.reset_index()[['Date', 'Close']]
    .sample(10, random_state=42)
    .set_index('Date')
    .sort_index()
)

# Differences are taken between consecutive rows of the sample.
# Column labels no longer carry the stray trailing space.
df_sample['Closing Price Change'] = df_sample.Close.diff(periods=1)
# pct_change returns a fraction (0.3 means +30 %).
df_sample['Closing Price % Change'] = df_sample.Close.pct_change(periods=1)
# Mean of the current and the previous row; the first row has no predecessor -> NaN.
df_sample['Closing Price 2 Period MA'] = df_sample.Close.rolling(window=2).mean()

df_sample

Unnamed: 0_level_0,Close,Closing Price Change,Closing Price % Change,Closing Price 2 Period MA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-04,2108.7,,,
2021-04-18,2167.4,58.8,0.0,2138.0
2021-04-28,2765.4,598.0,0.3,2466.4
2021-06-07,2514.8,-250.7,-0.1,2640.1
2021-06-10,2353.5,-161.3,-0.1,2434.1
2021-06-20,1895.5,-458.0,-0.2,2124.5
2021-10-09,3433.8,1538.3,0.8,2664.6
2021-10-22,4161.5,727.7,0.2,3797.6
2021-11-22,4355.1,193.6,0.0,4258.3
2021-11-29,4641.6,286.5,0.1,4498.4


### Create a datetime index

In [5]:
# Thirty pseudo-random values in [0, 1). A dedicated, seeded generator makes the
# frame reproducible on every run without touching the global `random` state.
SEED = 42
rng = random.Random(SEED)
b1 = [rng.random() for _ in range(30)]

# Thirty consecutive calendar days starting 2020-06-01 ('D' is the canonical
# daily alias; the lowercase 'd' spelling is deprecated in recent pandas).
b2 = pd.date_range('2020-06-01', periods=30, freq='D')

# One column 'M' holding the random values, indexed by the dates.
df = pd.DataFrame({'M': b1}, index=b2)
df.head()

Unnamed: 0,M
2020-06-01,0.072125
2020-06-02,0.617119
2020-06-03,0.452774
2020-06-04,0.826642
2020-06-05,0.586282


In [None]:
pd.da