<a href="https://colab.research.google.com/github/PippleNL/DSMediorSligro/blob/main/Time_Series_Analysis%2C_lecture_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Time Series Analysis, lecture #1

## Initial data evaluation

In [11]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [12]:
# input the data set
url = 'https://raw.githubusercontent.com/PippleNL/DSMediorSligro/main/data/house_prices_netherlands.csv'
df = pd.read_csv(url)
df.head(5)

Unnamed: 0,DATE,House Prices Netherlands
0,1971-01-01,12.8348
1,1971-04-01,11.5181
2,1971-07-01,9.4702
3,1971-10-01,10.3687
4,1972-01-01,11.4196


In [13]:
# describe the data set
df.describe()

Unnamed: 0,House Prices Netherlands
count,209.0
mean,6.596726
std,9.375665
min,-16.4666
25%,1.2758
50%,5.5933
75%,11.3689
max,45.6748


In [14]:
# check for missing values
df.isnull().sum()

DATE                        2
House Prices Netherlands    0
dtype: int64

In [15]:
# drop missing values
df = df.dropna(how='any')

In [20]:
# checking for duplicates
df.duplicated().sum()

0

In [19]:
df = df.drop_duplicates()

In [22]:
# Extreme outliers
from scipy import stats

In [29]:
q_low = df["price"].quantile(0.01)
q_high = df["price"].quantile(0.99)

In [32]:
df = df[(df["price"] < q_high) & (df["price"] > q_low)]

In [33]:
df.describe()

Unnamed: 0,price
count,197.0
mean,6.154441
std,7.958226
min,-11.1747
25%,1.4682
50%,5.2645
75%,10.8411
max,33.1352


## Datetime basics

### Python's datetime

In [34]:
from datetime import datetime

In [37]:
date_time = datetime(year=2022, month=10, day=13, second=16)

In [38]:
date_time

datetime.datetime(2022, 10, 13, 0, 0, 16)

In [42]:
date_time.month

10

### numpy's data type datetime64

In [43]:
import numpy as np

In [54]:
np.array(['1995-01-10', '2003-10-12', '2022-03-05'], dtype='datetime64[s]')

array(['1995-01-10T00:00:00', '2003-10-12T00:00:00',
       '2022-03-05T00:00:00'], dtype='datetime64[s]')

In [59]:
np.arange('1995-01-10', '2003-10-12', 7, dtype='datetime64[Y]')

array(['1995', '2002'], dtype='datetime64[Y]')

### Pandas' datetime

In [60]:
import pandas as pd

In [61]:
pd.date_range('2001-12-02', periods=5, freq='D')

DatetimeIndex(['2001-12-02', '2001-12-03', '2001-12-04', '2001-12-05',
               '2001-12-06'],
              dtype='datetime64[ns]', freq='D')

In [73]:
pd.to_datetime('01 01, 2018', format='%d %m, %Y')

Timestamp('2018-01-01 00:00:00')

In [91]:
pd.to_datetime(['Jan 01, 2018'])

DatetimeIndex(['2018-01-01'], dtype='datetime64[ns]', freq=None)

## Create some random data

In [86]:
data =np.random.randn(3,2)

In [83]:
date_index = pd.DatetimeIndex(np.array(['1995-01-10', '2003-10-12', '2022-03-05'], dtype='datetime64[D]'))
date_index

DatetimeIndex(['1995-01-10', '2003-10-12', '2022-03-05'], dtype='datetime64[ns]', freq=None)

In [89]:
df = pd.DataFrame(data, date_index, columns=['car_prices', 'house_prices'])
df.head()

Unnamed: 0,car_prices,house_prices
1995-01-10,0.667752,2.100782
2003-10-12,0.092065,-0.899866
2022-03-05,0.230078,0.169697


## Time resampling

In [90]:
import pandas as pd

In [93]:
url_resampling = 'https://raw.githubusercontent.com/PippleNL/DSMediorSligro/main/data/starbucks.csv'
df_resampling = pd.read_csv(url_resampling)
df_resampling.head()

Unnamed: 0,Date,Close,Volume
0,2015-01-02,38.0061,6906098
1,2015-01-05,37.2781,11623796
2,2015-01-06,36.9748,7664340
3,2015-01-07,37.8848,9732554
4,2015-01-08,38.4961,13170548


In [96]:
# Read in data with date column as index
df_resampling = pd.read_csv(url_resampling, index_col='Date')
df_resampling.index

Index(['2015-01-02', '2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
       '2015-01-09', '2015-01-12', '2015-01-13', '2015-01-14', '2015-01-15',
       ...
       '2018-12-17', '2018-12-18', '2018-12-19', '2018-12-20', '2018-12-21',
       '2018-12-24', '2018-12-26', '2018-12-27', '2018-12-28', '2018-12-31'],
      dtype='object', name='Date', length=1006)

In [98]:
# Read in data with date column as index, datetime format
df_resampling = pd.read_csv(url_resampling, index_col='Date', parse_dates=True)
df_resampling.index

DatetimeIndex(['2015-01-02', '2015-01-05', '2015-01-06', '2015-01-07',
               '2015-01-08', '2015-01-09', '2015-01-12', '2015-01-13',
               '2015-01-14', '2015-01-15',
               ...
               '2018-12-17', '2018-12-18', '2018-12-19', '2018-12-20',
               '2018-12-21', '2018-12-24', '2018-12-26', '2018-12-27',
               '2018-12-28', '2018-12-31'],
              dtype='datetime64[ns]', name='Date', length=1006, freq=None)