In [5]:
import pandas as pd
import numpy as np
import os
# Directory holding the data files read below. Set the PANDAS_DATASETS_DIR
# environment variable to point somewhere else; when it is unset the original
# machine-specific path is used, so existing setups keep working.
os.chdir(os.environ.get(
    'PANDAS_DATASETS_DIR',
    r'C:\Users\dell\PycharmProjects\MachineLearning\Pandas\datasets',
))

In [8]:
# Truncate long Series/DataFrame reprs: show at most 5 rows.
pd.options.display.max_rows = 5

# asfreq

In [None]:
# asfreq re-indexes a time series onto a regular frequency.
# E.g. a series observed on 06/10/2001 and 12/10/2001
#     -> asfreq('2D') --> 06/10/2001, 08/10/2001, 10/10/2001, 12/10/2001

# Day-first dates: 6 October 2001 and 12 October 2001.
days = pd.to_datetime(['06/10/2001', '12/10/2001'], format='%d/%m/%Y')

sale = pd.Series(data=[100, 98], index=days)

# Upsample to one row every 2 days; the rows that asfreq creates are
# forward-filled from the last observed value.
sale.asfreq(freq='2D', method='ffill')

In [4]:
# Same upsampling, but the rows asfreq creates get the constant -1 instead of a forward fill
sale.asfreq('2D', fill_value = -1)

2001-10-06    100
2001-10-08     -1
2001-10-10     -1
2001-10-12     98
Freq: 2D, dtype: int64

# rolling

In [6]:
#downsample

In [9]:
# Load germany.csv, using its first column as the index and parsing that index as dates
g = pd.read_csv('germany.csv', index_col = 0, parse_dates = True)
g

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-01-01,1069.18400,,,
2006-01-02,1380.52100,,,
...,...,...,...,...
2017-12-30,1215.44897,721.247,7.467,728.714
2017-12-31,1107.11488,721.176,19.980,741.156


In [10]:
# Select every row whose date falls in 2016 by passing the year as a string to .loc
g.loc['2016']

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-01,1060.366,107.209,18.563,125.772
2016-01-02,1192.698,409.622,9.406,419.028
...,...,...,...,...
2016-12-30,1291.044,239.257,48.546,287.803
2016-12-31,1212.568,315.063,34.580,349.643


In [11]:
# Total (sum, not average) of Consumption over a rolling 7-day window ending at each index date
g.rolling('7D')['Consumption'].sum()

Date
2006-01-01    1069.18400
2006-01-02    2449.70500
                 ...    
2017-12-30    8457.49890
2017-12-31    8422.85648
Name: Consumption, Length: 4383, dtype: float64

In [13]:
df = pd.read_csv('./electric_production.csv')
# Convert DATE to datetime so it can be used for a time-based ('7D') window
df['DATE'] = pd.to_datetime(df['DATE'])
# Rolling over a date column instead of the index: name the column with `on`.
# df.columns[1] picks the second column by position (IPG2211A2N in the output below).
# NOTE(review): if the DATE values are spaced more than 7 days apart, every window
# holds a single row and the sum just echoes the column -- confirm the data's frequency.
weekly_production = df.rolling('7D', on = 'DATE')[df.columns[1]].sum()
weekly_production

0       72.5052
1       70.6720
         ...   
395    114.7212
396    129.4048
Name: IPG2211A2N, Length: 397, dtype: float64

In [14]:
# IPython-only syntax: a trailing ? displays the help for df.rolling (not valid in plain Python)
df.rolling?

# Shifting data

In [None]:
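#shift(): moves the data by a number of periods; the index stays in place
#tshift(): moves the index by a time offset; the data stays in place
#          (deprecated since pandas 1.1 -- shift(freq=...) does the same)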

## shift

```python
DataFrame.shift(periods=1, freq=None, axis=0, fill_value=None)
```

In [16]:
# 5x5 table of random integer scores in [5, 10).
# A private, seeded RandomState makes the table identical on every run
# without touching NumPy's global random state.
scores = pd.DataFrame(np.random.RandomState(0).randint(5, 10, (5, 5)))
scores

Unnamed: 0,0,1,2,3,4
0,7,7,5,8,8
1,9,5,6,5,6
2,9,5,5,6,8
3,8,6,7,6,5
4,8,8,8,5,9


In [15]:
# Move the values 3 rows down; the index is untouched and the vacated top rows become NaN
shift_3_rows = scores.shift(periods=3, axis=0)
shift_3_rows

Unnamed: 0,0,1,2,3,4
0,,,,,
1,,,,,
2,,,,,
3,7.0,8.0,6.0,9.0,5.0
4,9.0,7.0,9.0,6.0,5.0


In [17]:
# Move the values 2 columns to the right and put -1 (instead of NaN) in the vacated columns.
# This shifts positions only; no dates are involved.
shift_2_columns = scores.shift(periods=2, axis=1, fill_value=-1)
shift_2_columns

Unnamed: 0,0,1,2,3,4
0,-1,-1,7,7,5
1,-1,-1,9,5,6
2,-1,-1,9,5,5
3,-1,-1,8,6,7
4,-1,-1,8,8,8


## tshift

```python
DataFrame.tshift(periods=1, freq=None, axis=0)
```

In [27]:
# Replace the integer row labels with 5 consecutive calendar days (1-5 January 2020)
scores.index = pd.date_range(start='1/1/2020', end='1/5/2020', freq='D')
scores

Unnamed: 0,0,1,2,3,4
2020-01-01,7,7,5,8,8
2020-01-02,9,5,6,5,6
2020-01-03,9,5,5,6,8
2020-01-04,8,6,7,6,5
2020-01-05,8,8,8,5,9


In [31]:
# Shift the index (not the data) forward by 2 days.
# DataFrame.tshift is deprecated since pandas 1.1 and removed in 2.0;
# shift(freq=...) is the documented replacement and gives the same result.
scores.shift(freq='2D')

Unnamed: 0,0,1,2,3,4
2020-01-03,7,7,5,8,8
2020-01-04,9,5,6,5,6
2020-01-05,9,5,5,6,8
2020-01-06,8,6,7,6,5
2020-01-07,8,8,8,5,9


# resample

```python
data.resample(
    rule,
    how=None,
    axis=0,
    fill_method=None,
    closed=None,
    label=None,
    convention='start',
    kind=None,
    loffset=None,
    limit=None,
    base=0,
    on=None,
    level=None,
)
```

In [40]:
# Intuition: data.resample('3D') --> group every 3 consecutive days into one bin,
# then apply an aggregate function to each bin.
#
# resample is a convenience method for frequency conversion and resampling of time series.
# The object must have a datetime-like index (DatetimeIndex, PeriodIndex, or TimedeltaIndex),
# or datetime-like values must be passed via the `on` or `level` keyword.

# Nine consecutive days starting 6 October 2001, carrying the values 0..8
days = pd.date_range(start='10/06/2001', periods=9, freq='D')
data = pd.Series(data=range(9), index=days)
data

2001-10-06    0
2001-10-07    1
             ..
2001-10-13    7
2001-10-14    8
Freq: D, Length: 9, dtype: int64

In [41]:
#Downsample: resample() alone only builds the 3-day bins (a Resampler object);
#nothing is aggregated until a method such as sum() is called on it

downsample = data.resample('3D')
downsample

DatetimeIndexResampler [freq=<3 * Days>, axis=0, closed=left, label=left, convention=start, base=0]

In [42]:
#Downsample into 3-day bins and sum the values in each bin (9 days -> 3 rows)
#NOTE: by default each bin is labelled with its left edge
compressed = data.resample('3D').sum()
compressed

2001-10-06     3
2001-10-09    12
2001-10-12    21
Freq: 3D, dtype: int64

In [43]:
#To label each bin with its right edge instead, pass label = 'right':
label_right = data.resample('3D', label = 'right').sum()
label_right

2001-10-09     3
2001-10-12    12
2001-10-15    21
Freq: 3D, dtype: int64

In [45]:
#Upsample to one row every 8 hours; forward fill the rows that resampling creates
upsample_ffill = data.resample('8H').ffill()
upsample_ffill

2001-10-06 00:00:00    0
2001-10-06 08:00:00    0
                      ..
2001-10-13 16:00:00    7
2001-10-14 00:00:00    8
Freq: 8H, Length: 25, dtype: int64

In [46]:
#Upsample to one row every 8 hours; backward fill the rows that resampling creates
upsample_bfill = data.resample('8H').bfill()
upsample_bfill

2001-10-06 00:00:00    0
2001-10-06 08:00:00    1
                      ..
2001-10-13 16:00:00    8
2001-10-14 00:00:00    8
Freq: 8H, Length: 25, dtype: int64

In [47]:
#Resample on a MultiIndex
#Level 0 holds 4 consecutive days, level 1 the part of the day -> 8 rows
days = pd.date_range(start='1/1/2000', periods=4, freq='D')
data = {
    'price': [10, 11, 9, 13, 14, 18, 17, 19],
    'volume': [50, 60, 40, 100, 50, 100, 40, 50],
}
day_parts = pd.MultiIndex.from_product([days, ['Morning', 'Afternoon']])
df = pd.DataFrame(data, index=day_parts)
df

Unnamed: 0,Unnamed: 1,price,volume
2000-01-01,Morning,10,50
2000-01-01,Afternoon,11,60
...,...,...,...
2000-01-04,Morning,17,40
2000-01-04,Afternoon,19,50


In [48]:
#On a MultiIndex, name the datetime level to resample on with the level keyword.
#Here level 0 (the dates) is grouped into 2-day bins and each bin is summed
downsample = df.resample('2D', level = 0).sum()
downsample

Unnamed: 0,price,volume
2000-01-01,43,250
2000-01-03,68,240


In [49]:
#Resample on a DataFrame column instead of the index: pass on = 'column name'

# A private, seeded RandomState makes the 12 daily figures (integers in [50, 100))
# identical on every run without touching NumPy's global random state.
sale = pd.DataFrame({
'Phone': np.random.RandomState(0).randint(50, 100, 12),
'Date': pd.date_range('09/24/2019', periods = 12, freq = 'D')
})
#Downsample the 12 days into 3 bins of 4 days and sum each bin;
#the 'Date' column becomes the index of the result
sale = sale.resample('4D', on = 'Date').sum()
sale

Unnamed: 0_level_0,Phone
Date,Unnamed: 1_level_1
2019-09-24,318
2019-09-28,295
2019-10-02,257


# to_period

In [36]:
# Whitespace-separated file; columns 0, 1 and 2 are combined and parsed as one
# date column (exposed as 'Yr_Mo_Dy', the column the following lines use).
# The separator is a raw string: in a normal string '\s' is an invalid escape sequence.
df = pd.read_csv('./wind.data', sep = r'\s+', parse_dates = [[0,1,2]])

def autocorrect(DateIndex):
    """Move a date whose year is after 2000 back by one century.

    Parameters
    ----------
    DateIndex : object exposing ``year``, ``month`` and ``day`` (e.g. a pd.Timestamp)

    Returns
    -------
    pd.Timestamp
        A timestamp built from the (possibly corrected) year and the original
        month and day. Years of 2000 or earlier are kept unchanged.
    """
    year = DateIndex.year
    if year > 2000:
        year -= 100
    return pd.Timestamp(year=year, month=DateIndex.month, day=DateIndex.day)
# Fix the century of every parsed date, make sure the column is datetime typed,
# then use it as the index.
df['Yr_Mo_Dy'] = pd.to_datetime(df['Yr_Mo_Dy'].apply(autocorrect))
df = df.set_index('Yr_Mo_Dy')
df

Unnamed: 0_level_0,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
Yr_Mo_Dy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1961-01-01,15.04,14.96,13.17,9.29,,9.87,13.67,10.25,10.83,12.58,18.50,15.04
1961-01-02,14.71,,10.83,6.50,12.62,7.67,11.50,10.04,9.79,9.67,17.54,13.83
...,...,...,...,...,...,...,...,...,...,...,...,...
1978-12-30,18.50,14.04,21.29,9.13,12.75,9.71,18.08,12.87,12.46,12.12,14.67,28.79
1978-12-31,20.33,17.41,27.29,9.59,12.08,10.13,19.25,11.63,11.58,11.38,12.08,22.08


In [37]:
#NOTE: df.resample(offset).mean() gives the same values as the groupby below,
#but its index entries are full dates (year, month, day)

#Grouping by the index converted to periods labels each row with the period alone,
#e.g. df.groupby(df.index.to_period('A')).mean() is labelled by year only

#Downsample to yearly frequency: mean of each location (column) per year
yearly = df.groupby(df.index.to_period('A')).mean()
yearly

Unnamed: 0_level_0,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
Yr_Mo_Dy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1961,12.299583,10.351796,11.362369,6.958227,10.881763,7.729726,9.733923,8.858788,8.647652,9.835577,13.502795,13.680773
1962,12.246923,10.110438,11.732712,6.960440,10.657918,7.393068,11.020712,8.793753,8.316822,9.676247,12.930685,14.323956
...,...,...,...,...,...,...,...,...,...,...,...,...
1977,13.099616,11.144493,12.627836,6.073945,10.003836,8.586438,11.523205,8.378384,9.098192,8.821616,13.459068,16.590849
1978,12.504356,11.044274,11.380000,6.082356,10.167233,7.650658,9.489342,8.800466,9.089753,8.301699,12.967397,16.771370


In [39]:
#Same yearly means via resample, for comparison: the values match, but each row is labelled with a full date (31 December)
df.resample('A').mean()

Unnamed: 0_level_0,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
Yr_Mo_Dy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1961-12-31,12.299583,10.351796,11.362369,6.958227,10.881763,7.729726,9.733923,8.858788,8.647652,9.835577,13.502795,13.680773
1962-12-31,12.246923,10.110438,11.732712,6.960440,10.657918,7.393068,11.020712,8.793753,8.316822,9.676247,12.930685,14.323956
...,...,...,...,...,...,...,...,...,...,...,...,...
1977-12-31,13.099616,11.144493,12.627836,6.073945,10.003836,8.586438,11.523205,8.378384,9.098192,8.821616,13.459068,16.590849
1978-12-31,12.504356,11.044274,11.380000,6.082356,10.167233,7.650658,9.489342,8.800466,9.089753,8.301699,12.967397,16.771370


In [38]:
#Downsample to monthly frequency: mean of each location (column) per year-month
df.groupby(df.index.to_period('M')).mean()

Unnamed: 0_level_0,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
Yr_Mo_Dy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1961-01,14.841333,11.988333,13.431613,7.736774,11.072759,8.588065,11.184839,9.245333,9.085806,10.107419,13.880968,14.703226
1961-02,16.269286,14.975357,14.441481,9.230741,13.852143,10.937500,11.890714,11.846071,11.821429,12.714286,18.583214,15.411786
...,...,...,...,...,...,...,...,...,...,...,...,...
1978-11,16.151667,14.802667,13.508000,7.317333,11.475000,8.743000,11.492333,9.657333,10.701333,10.676000,17.404667,20.723000
1978-12,16.175484,13.748065,15.635161,7.094839,11.398710,9.241613,12.077419,10.194839,10.616774,11.028710,13.859677,21.371613
