## 시계열 데이터의 기초 문법

### .now()
- 코드를 실행한 시점의 날짜와 시간을 불러온다.
- 출력 형태  
'0000(년)-00(월)-00(일) 00(시):00(분):00.000000(초)'

In [1]:
## datetime: capture the date and time at the moment this cell runs

import datetime
now = datetime.datetime.now()
# print() shows the readable string form; display() shows the repr form.
print(now)
display(now)

2024-07-04 15:32:32.273389


datetime.datetime(2024, 7, 4, 15, 32, 32, 273389)

### .datetime()
- 시계열 데이터를 만들 수 있다.  
- ex) sample_data = datetime.datetime(2024, 7, 7, 19, 59, 59)

In [2]:
sample_data = datetime.datetime(2024, 7, 7, 19, 59, 59) # 2024-07-07 19:59:59 (cf. the deadline of the week-10 review assignment)

In [3]:
# Show the hand-built datetime in its string form and in its repr form.
print(sample_data)
display(sample_data)

2024-07-07 19:59:59


datetime.datetime(2024, 7, 7, 19, 59, 59)

In [4]:
# Both the hand-built value and the now() result are datetime.datetime objects.
print(type(sample_data))
print(type(now))

<class 'datetime.datetime'>
<class 'datetime.datetime'>


In [5]:
## Individual fields such as year, month, day and hour can be read off a datetime.
sample_data.year

2024

In [6]:
# Month field of the datetime.
sample_data.month

7

In [7]:
# Day-of-month field of the datetime.
sample_data.day

7

In [8]:
# Hour field of the datetime (24-hour clock; 19 for this sample).
sample_data.hour

19

### .today()
- 실행 시점의 연, 월, 일만을 불러온다.

In [9]:
# Date of the moment the cell runs; the result is a date object with no time part.
datetime.date.today()

datetime.date(2024, 7, 4)

### .strftime()
- 날짜와 시간을 지정한 형식의 문자열로 변환하여 반환한다.
- ex) sample_data.strftime('%Y-%m-%d %H:%M:%S')

In [10]:
# Format every field: year-month-day hour:minute:second.
sample_data.strftime('%Y-%m-%d %H:%M:%S')

'2024-07-07 19:59:59'

In [11]:
# The format string may also pick out only the parts you want (here month and day).
sample_data.strftime('%m-%d')

'07-07'

### .strptime()
- 지정한 형식의 문자열을 해석하여 datetime 객체(시계열 데이터)로 만들어 주는 함수이다.
- ex) sample_data_ob = datetime.datetime.strptime('2024-07-07 19:59:59', '%Y-%m-%d %H:%M:%S')

In [12]:
# Parse the string into a datetime object, reading it according to the given format.
sample_data_ob = datetime.datetime.strptime('2024-07-07 19:59:59', '%Y-%m-%d %H:%M:%S')

In [13]:
# The parsed value prints like any other datetime and has the datetime.datetime type.
print(sample_data_ob)
print(type(sample_data_ob))

2024-07-07 19:59:59
<class 'datetime.datetime'>


### .timedelta()
- 두 날짜 또는 시간의 차이를 나타낸다.
- ex) sample_data_gap = datetime.timedelta(days = 6, hours = 2)

In [14]:
# A duration of 6 days and 2 hours.
sample_data_gap = datetime.timedelta(days = 6, hours = 2)

In [15]:
# The repr expresses the 2 hours as seconds=7200.
sample_data_gap

datetime.timedelta(days=6, seconds=7200)

In [16]:
# Future: move forward from sample_data by the gap.
sample_data + sample_data_gap

datetime.datetime(2024, 7, 13, 21, 59, 59)

In [17]:
# Past: move backward from sample_data by the gap.
sample_data - sample_data_gap

datetime.datetime(2024, 7, 1, 17, 59, 59)

### .weekday()
- 요일을 정수로 반환하므로 주말인지 평일인지 등을 계산할 수 있다.
- 월요일 0 / 화요일 1 / 수요일 2 / 목요일 3 / 금요일 4 / 토요일 5 / 일요일 6

In [18]:
# Day of the week as an integer (Monday = 0 ... Sunday = 6); 6 here means Sunday.
sample_data.weekday()

6

In [19]:
print(sample_data.weekday() >= 5) # True when the date falls on a weekend (Saturday = 5, Sunday = 6).
print(sample_data.weekday() < 5) # True when the date falls on a weekday (Monday = 0 ... Friday = 4).

True
False


In [20]:
# Apply a timedelta so that a different day of the week comes out.
# weekday() counts from Monday = 0, so (7 - weekday) % 7 is the number of days
# until the next Monday (0 if the date already is a Monday); for sample_data,
# a Sunday (6), this gives 1 day.
current_weekday = sample_data.weekday()
days_until_monday = (7 - current_weekday) % 7
sample_week_day = sample_data + datetime.timedelta(days=days_until_monday)

In [21]:
# The shifted date is one day later than sample_data, with the same time of day.
print(sample_week_day)

2024-07-08 19:59:59


### relativedelta 라이브러리
- 현재 날짜로부터 n개월 후의 날짜 계산 / n년 전의 날짜 계산 / 다양한 단위의 혼합 사용 / 날짜 조작 등
- 상대적인 날짜를 계산할 때 유용하다.

In [22]:
from dateutil.relativedelta import relativedelta

In [23]:
# Return the date 6 months after sample_data.
sample_data + relativedelta(months = 6)

datetime.datetime(2025, 1, 7, 19, 59, 59)

In [24]:
# Return the date 6 months before sample_data.
sample_data - relativedelta(months = 6)

datetime.datetime(2024, 1, 7, 19, 59, 59)

In [25]:
# 2 years after sample_data.
sample_data + relativedelta(years = 2)

datetime.datetime(2026, 7, 7, 19, 59, 59)

In [26]:
# 2 years before sample_data.
sample_data - relativedelta(years = 2)

datetime.datetime(2022, 7, 7, 19, 59, 59)

In [27]:
# pandas' .date_range() generates the dates that fall inside a chosen range.
import pandas as pd

# freq sets the spacing between the generated dates (for example 'D' = daily,
# 'W' = weekly). Month-end spacing is requested with an offset object rather
# than a string alias: the alias is spelled 'M' in older pandas but is
# deprecated in favor of 'ME' from pandas 2.2 on, whereas MonthEnd() means the
# same thing on every version.
sample_data_range = pd.date_range(
    start='2023-07-04',
    end='2024-07-04',
    freq=pd.offsets.MonthEnd(),
)

In [28]:
# The result is a DatetimeIndex holding one month-end date per month in the range.
sample_data_range

DatetimeIndex(['2023-07-31', '2023-08-31', '2023-09-30', '2023-10-31',
               '2023-11-30', '2023-12-31', '2024-01-31', '2024-02-29',
               '2024-03-31', '2024-04-30', '2024-05-31', '2024-06-30'],
              dtype='datetime64[ns]', freq='M')

### calendar 라이브러리
- 달력 관련 작업을 수행하기 위한 다양한 기능을 제공하는 모듈
- 달력 출력 / 요일 계산 / 달력 설정 등이 가능하다

In [29]:
import calendar

# Text calendar for July 2024, kept as a string so it can be printed later.
cal_7 = calendar.month(2024, 7)

In [30]:
# print() renders the month as a grid with weeks starting on Monday.
print(cal_7)

     July 2024
Mo Tu We Th Fr Sa Su
 1  2  3  4  5  6  7
 8  9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
29 30 31



### crime 데이터를 불러와서 시계열데이터로 조작해보자!

In [31]:
# Load the crime data set; crime.csv is looked up relative to the working directory.
sample_df = pd.read_csv('crime.csv')

In [32]:
# Preview the loaded frame.
sample_df

Unnamed: 0.1,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,REPORTED_DATE,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
0,0,traffic-accident-dui-duid,traffic-accident,2014-06-29 02:01:00,-105.000149,39.745753,cbd,0,1
1,1,vehicular-eluding-no-chase,all-other-crimes,2014-06-29 01:54:00,-104.884660,39.738702,east-colfax,1,0
2,2,disturbing-the-peace,public-disorder,2014-06-29 02:00:00,-105.020719,39.706674,athmar-park,1,0
3,3,curfew,public-disorder,2014-06-29 02:18:00,-105.001552,39.769505,sunnyside,1,0
4,4,aggravated-assault,aggravated-assault,2014-06-29 04:17:00,-105.018557,39.679229,college-view-south-platte,1,0
...,...,...,...,...,...,...,...,...,...
460906,460906,burglary-business-by-force,burglary,2017-09-13 05:48:00,-105.033840,39.762365,west-highland,1,0
460907,460907,weapon-unlawful-discharge-of,all-other-crimes,2017-09-12 20:37:00,-105.040313,39.721264,barnum-west,1,0
460908,460908,traf-habitual-offender,all-other-crimes,2017-09-12 16:32:00,-104.847024,39.779596,montbello,1,0
460909,460909,criminal-mischief-other,public-disorder,2017-09-12 13:04:00,-104.949183,39.756353,skyland,1,0


In [33]:
# Only data of a datetime dtype can use the time-series syntax.
# Check the dtypes: REPORTED_DATE is still object (plain strings) at this point.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460911 entries, 0 to 460910
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           460911 non-null  int64  
 1   OFFENSE_TYPE_ID      460911 non-null  object 
 2   OFFENSE_CATEGORY_ID  460911 non-null  object 
 3   REPORTED_DATE        460911 non-null  object 
 4   GEO_LON              457296 non-null  float64
 5   GEO_LAT              457296 non-null  float64
 6   NEIGHBORHOOD_ID      460911 non-null  object 
 7   IS_CRIME             460911 non-null  int64  
 8   IS_TRAFFIC           460911 non-null  int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 31.6+ MB


In [34]:
sample_df_copy = sample_df.copy()  # work on a copy so sample_df keeps its original form

### pd.to_datetime()
- 문자열을 날짜 / 시간 객체로 변환해준다.

In [35]:
# The REPORTED_DATE column held strings; convert it to a date/time dtype.
sample_df_copy['REPORTED_DATE'] = pd.to_datetime(sample_df_copy['REPORTED_DATE'])

In [36]:
sample_df_copy.info() # The Dtype of REPORTED_DATE has changed to datetime64[ns].

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460911 entries, 0 to 460910
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Unnamed: 0           460911 non-null  int64         
 1   OFFENSE_TYPE_ID      460911 non-null  object        
 2   OFFENSE_CATEGORY_ID  460911 non-null  object        
 3   REPORTED_DATE        460911 non-null  datetime64[ns]
 4   GEO_LON              457296 non-null  float64       
 5   GEO_LAT              457296 non-null  float64       
 6   NEIGHBORHOOD_ID      460911 non-null  object        
 7   IS_CRIME             460911 non-null  int64         
 8   IS_TRAFFIC           460911 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 31.6+ MB


In [37]:
# Compare the column before the conversion (dtype object) and after it (dtype datetime64[ns]).
display(sample_df['REPORTED_DATE'])
display(sample_df_copy['REPORTED_DATE'])

0         2014-06-29 02:01:00
1         2014-06-29 01:54:00
2         2014-06-29 02:00:00
3         2014-06-29 02:18:00
4         2014-06-29 04:17:00
                 ...         
460906    2017-09-13 05:48:00
460907    2017-09-12 20:37:00
460908    2017-09-12 16:32:00
460909    2017-09-12 13:04:00
460910    2017-09-12 09:30:00
Name: REPORTED_DATE, Length: 460911, dtype: object

0        2014-06-29 02:01:00
1        2014-06-29 01:54:00
2        2014-06-29 02:00:00
3        2014-06-29 02:18:00
4        2014-06-29 04:17:00
                 ...        
460906   2017-09-13 05:48:00
460907   2017-09-12 20:37:00
460908   2017-09-12 16:32:00
460909   2017-09-12 13:04:00
460910   2017-09-12 09:30:00
Name: REPORTED_DATE, Length: 460911, dtype: datetime64[ns]

### set_index()
- 앞선 데이터를 시계열 데이터처럼 사용하기 위해서는 한 번 더 가공이 필요하다.
- 위 함수를 사용하면 시계열 데이터를 인덱스로 변환해준다.

In [38]:
sample_df_copy = sample_df_copy.set_index('REPORTED_DATE') # With the dates as the index, the time-series syntax can be used freely.

In [39]:
sample_df_copy.info() # REPORTED_DATE is gone from the column list; it has become the index instead.

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 460911 entries, 2014-06-29 02:01:00 to 2017-09-12 09:30:00
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           460911 non-null  int64  
 1   OFFENSE_TYPE_ID      460911 non-null  object 
 2   OFFENSE_CATEGORY_ID  460911 non-null  object 
 3   GEO_LON              457296 non-null  float64
 4   GEO_LAT              457296 non-null  float64
 5   NEIGHBORHOOD_ID      460911 non-null  object 
 6   IS_CRIME             460911 non-null  int64  
 7   IS_TRAFFIC           460911 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 31.6+ MB


In [40]:
# With the dates as the index, .loc can select rows by a date string (here: all of 2017).
sample_df_copy.loc['2017']

Unnamed: 0_level_0,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-05-22 14:41:00,7261,traffic-accident,traffic-accident,-104.673812,39.849292,dia,0,1
2017-05-17 20:35:00,7262,threats-to-injure,public-disorder,-105.020053,39.694351,ruby-hill,1,0
2017-06-07 07:47:00,7265,burglary-residence-by-force,burglary,-104.981677,39.763597,five-points,1,0
2017-05-26 16:46:00,7270,theft-other,larceny,-104.839119,39.769694,stapleton,1,0
2017-06-07 07:42:00,7272,criminal-trespassing,all-other-crimes,-104.673812,39.849292,dia,1,0
...,...,...,...,...,...,...,...,...
2017-09-13 05:48:00,460906,burglary-business-by-force,burglary,-105.033840,39.762365,west-highland,1,0
2017-09-12 20:37:00,460907,weapon-unlawful-discharge-of,all-other-crimes,-105.040313,39.721264,barnum-west,1,0
2017-09-12 16:32:00,460908,traf-habitual-offender,all-other-crimes,-104.847024,39.779596,montbello,1,0
2017-09-12 13:04:00,460909,criminal-mischief-other,public-disorder,-104.949183,39.756353,skyland,1,0


In [41]:
# A full date selects every row of that single day.
sample_df_copy.loc['2017-01-01']

Unnamed: 0_level_0,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-01 02:26:00,37971,assault-dv,other-crimes-against-persons,-104.991476,39.751538,cbd,1,0
2017-01-01 19:37:00,343343,traf-other,all-other-crimes,-104.950485,39.736722,congress-park,1,0
2017-01-01 03:25:00,343696,theft-of-motor-vehicle,auto-theft,-105.001424,39.746412,auraria,1,0
2017-01-01 02:47:00,343785,traf-other,all-other-crimes,-105.024826,39.764654,highland,1,0
2017-01-01 10:46:00,343820,criminal-mischief-mtr-veh,public-disorder,-105.041320,39.741273,west-colfax,1,0
...,...,...,...,...,...,...,...,...
2017-01-01 11:50:00,379037,weapon-fire-into-occ-bldg,aggravated-assault,-104.740735,39.797687,gateway-green-valley-ranch,1,0
2017-01-01 12:32:00,379195,traffic-accident,traffic-accident,-104.925797,39.743733,south-park-hill,0,1
2017-01-01 16:23:00,379207,drug-poss-paraphernalia,drug-alcohol,-104.890551,39.740155,east-colfax,1,0
2017-01-01 15:20:00,379274,assault-dv,other-crimes-against-persons,-104.976711,39.722146,speer,1,0


In [42]:
# The month may also be written in words; this selects all of January 2017.
sample_df_copy.loc['Jan 2017']

Unnamed: 0_level_0,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-15 00:46:00,11473,theft-other,larceny,-104.976756,39.762913,five-points,1,0
2017-01-27 19:20:00,11837,assault-dv,other-crimes-against-persons,-104.966331,39.745437,city-park-west,1,0
2017-01-23 05:12:00,11882,assault-dv,other-crimes-against-persons,-104.978988,39.748799,five-points,1,0
2017-01-05 20:03:00,13521,criminal-mischief-mtr-veh,public-disorder,-104.968347,39.777425,elyria-swansea,1,0
2017-01-14 14:32:00,13528,assault-dv,other-crimes-against-persons,-104.929743,39.764812,northeast-park-hill,1,0
...,...,...,...,...,...,...,...,...
2017-01-13 01:06:00,394421,assault-simple,other-crimes-against-persons,-104.925197,39.678463,goldsmith,1,0
2017-01-11 15:04:00,402156,criminal-mischief-mtr-veh,public-disorder,-104.864705,39.758337,stapleton,1,0
2017-01-14 17:29:00,403548,assault-dv,other-crimes-against-persons,-104.857357,39.659293,kennedy,1,0
2017-01-24 10:55:00,422215,fraud-identity-theft,white-collar-crime,-104.917871,39.675902,goldsmith,1,0


In [43]:
# .sort_index() orders the rows by the index; the selection above is not in chronological order.
sample_df_copy.loc['Jan 2017'].sort_index()

Unnamed: 0_level_0,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-01 00:16:00,344311,weapon-unlawful-discharge-of,all-other-crimes,-104.943494,39.769454,clayton,1,0
2017-01-01 00:27:00,344762,theft-other,larceny,-104.994496,39.747944,cbd,1,0
2017-01-01 01:16:00,370148,weapon-unlawful-discharge-of,all-other-crimes,-104.838218,39.794725,montbello,1,0
2017-01-01 01:16:00,370737,assault-simple,other-crimes-against-persons,-104.991384,39.753888,five-points,1,0
2017-01-01 01:20:00,345896,traf-other,all-other-crimes,-104.961459,39.772793,cole,1,0
...,...,...,...,...,...,...,...,...
2017-01-31 23:19:00,175254,theft-parts-from-vehicle,theft-from-motor-vehicle,-104.979001,39.739775,capitol-hill,1,0
2017-01-31 23:38:00,173734,traffic-accident,traffic-accident,-104.953421,39.720975,cherry-creek,0,1
2017-01-31 23:45:00,173395,traf-other,all-other-crimes,-104.985900,39.694651,platt-park,1,0
2017-01-31 23:54:00,174264,theft-items-from-vehicle,theft-from-motor-vehicle,-104.974378,39.720900,speer,1,0


### .between_time()
- 해당 함수는 시간 인덱스를 기반으로 하는 데이터프레임이나 시리즈에서 특정 시간 범위 내의 데이터를 선택하는데 사용한다.

In [44]:
sample_df_copy.between_time('01:00', '03:00') # Across all dates in the index, show the rows whose time of day lies between 01:00 and 03:00.

Unnamed: 0_level_0,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-06-29 02:01:00,0,traffic-accident-dui-duid,traffic-accident,-105.000149,39.745753,cbd,0,1
2014-06-29 01:54:00,1,vehicular-eluding-no-chase,all-other-crimes,-104.884660,39.738702,east-colfax,1,0
2014-06-29 02:00:00,2,disturbing-the-peace,public-disorder,-105.020719,39.706674,athmar-park,1,0
2014-06-29 02:18:00,3,curfew,public-disorder,-105.001552,39.769505,sunnyside,1,0
2014-06-29 02:56:00,6,traffic-accident-dui-duid,traffic-accident,-105.052956,39.733315,villa-park,0,1
...,...,...,...,...,...,...,...,...
2017-09-22 01:30:00,460774,drug-methampetamine-sell,drug-alcohol,-105.025080,39.699230,westwood,1,0
2017-08-28 01:07:00,460852,traf-other,all-other-crimes,-105.011016,39.696419,ruby-hill,1,0
2017-09-13 02:21:00,460867,assault-simple,other-crimes-against-persons,-104.925733,39.654184,university-hills,1,0
2017-09-13 02:15:00,460889,traffic-accident-hit-and-run,traffic-accident,-105.043950,39.787436,regis,0,1


### .at_time()
- 해당 함수는 특정 시간에 해당하는 행들을 필터링하는데 사용한다.

In [45]:
# Rows whose time of day is exactly 15:30, whatever the date.
sample_df_copy.at_time('15:30')

Unnamed: 0_level_0,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-05-31 15:30:00,684,theft-shoplift,larceny,-105.021159,39.677518,college-view-south-platte,1,0
2013-12-06 15:30:00,1648,theft-items-from-vehicle,theft-from-motor-vehicle,-104.951523,39.725465,cherry-creek,1,0
2015-12-28 15:30:00,1780,traffic-accident-hit-and-run,traffic-accident,-104.931654,39.780147,northeast-park-hill,0,1
2014-03-01 15:30:00,2264,criminal-mischief-other,public-disorder,-104.992029,39.735009,civic-center,1,0
2012-09-17 15:30:00,2806,weapon-poss-illegal-dangerous,all-other-crimes,-105.022905,39.759878,highland,1,0
...,...,...,...,...,...,...,...,...
2017-09-02 15:30:00,456302,criminal-mischief-other,public-disorder,-105.024339,39.712048,valverde,1,0
2017-09-21 15:30:00,458010,burglary-residence-no-force,burglary,-104.903121,39.713464,lowry-field,1,0
2017-09-21 15:30:00,458637,assault-dv,other-crimes-against-persons,-104.969944,39.762913,cole,1,0
2017-08-25 15:30:00,459498,traffic-accident,traffic-accident,-104.987535,39.701456,baker,0,1


### .resample()
- Datetime Index를 원하는 주기로 나누어주는 메서드

In [46]:
# Re-check the frame before resampling: the index is a DatetimeIndex.
sample_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 460911 entries, 2014-06-29 02:01:00 to 2017-09-12 09:30:00
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           460911 non-null  int64  
 1   OFFENSE_TYPE_ID      460911 non-null  object 
 2   OFFENSE_CATEGORY_ID  460911 non-null  object 
 3   GEO_LON              457296 non-null  float64
 4   GEO_LAT              457296 non-null  float64
 5   NEIGHBORHOOD_ID      460911 non-null  object 
 6   IS_CRIME             460911 non-null  int64  
 7   IS_TRAFFIC           460911 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 31.6+ MB


In [47]:
# Look at the IS_TRAFFIC column on its own (the double brackets keep it a DataFrame).
df_crime = sample_df_copy[['IS_TRAFFIC']]

In [48]:
# Yearly totals; each bin is labelled with the last day of its year.
# NOTE(review): pandas 2.2+ deprecates the 'Y' alias in favor of 'YE' - confirm against the installed version.
df_crime.resample('Y').sum()

Unnamed: 0_level_0,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1
2012-12-31,19786
2013-12-31,18862
2014-12-31,21763
2015-12-31,23310
2016-12-31,23744
2017-12-31,17836


In [49]:
# Monthly totals; each bin is labelled with the last day of its month.
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favor of 'ME' - confirm against the installed version.
df_crime.resample('M').sum()

Unnamed: 0_level_0,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1
2012-01-31,1569
2012-02-29,1629
2012-03-31,1528
2012-04-30,1595
2012-05-31,1831
...,...
2017-05-31,2203
2017-06-30,2076
2017-07-31,2039
2017-08-31,2131


In [50]:
# Weekly totals.
df_crime.resample('W').sum()

Unnamed: 0_level_0,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1
2012-01-08,326
2012-01-15,471
2012-01-22,337
2012-01-29,333
2012-02-05,352
...,...
2017-09-03,501
2017-09-10,425
2017-09-17,503
2017-09-24,508


In [51]:
# The same aggregation can be taken directly from a column of the full frame.
sample_df_copy['IS_TRAFFIC'].resample('Y').sum()

REPORTED_DATE
2012-12-31    19786
2013-12-31    18862
2014-12-31    21763
2015-12-31    23310
2016-12-31    23744
2017-12-31    17836
Freq: A-DEC, Name: IS_TRAFFIC, dtype: int64

In [52]:
# Yearly mean; IS_TRAFFIC appears to be a 0/1 flag, so this reads as the share of traffic records per year.
sample_df_copy['IS_TRAFFIC'].resample('Y').mean()

REPORTED_DATE
2012-12-31    0.346898
2013-12-31    0.271290
2014-12-31    0.257791
2015-12-31    0.261475
2016-12-31    0.260705
2017-12-31    0.255889
Freq: A-DEC, Name: IS_TRAFFIC, dtype: float64

In [53]:
# Several columns can be resampled at once; here the monthly sums of both flags.
sample_df_copy[['IS_TRAFFIC', 'IS_CRIME']].resample('M').sum()

Unnamed: 0_level_0,IS_TRAFFIC,IS_CRIME
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-31,1569,2660
2012-02-29,1629,2353
2012-03-31,1528,2869
2012-04-30,1595,3070
2012-05-31,1831,3321
...,...,...
2017-05-31,2203,5965
2017-06-30,2076,5972
2017-07-31,2039,6005
2017-08-31,2131,6568


### .shift()
- 데이터를 지정한 행 수만큼 이동시킨다. (인덱스는 그대로 유지된다.)
- 차분(differencing)을 계산하거나 시계열 예측 모델에서 지연된(lagged) 변수를 생성하는데 유용하다.

In [54]:
display(sample_df_copy)
display(sample_df_copy.shift(20))
# The result shows the rows moved down by 20: the first 20 index positions now hold NaN. Judging by the Unnamed: 0 column, the value in the last row changed from 460910 to 460890.

Unnamed: 0_level_0,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-06-29 02:01:00,0,traffic-accident-dui-duid,traffic-accident,-105.000149,39.745753,cbd,0,1
2014-06-29 01:54:00,1,vehicular-eluding-no-chase,all-other-crimes,-104.884660,39.738702,east-colfax,1,0
2014-06-29 02:00:00,2,disturbing-the-peace,public-disorder,-105.020719,39.706674,athmar-park,1,0
2014-06-29 02:18:00,3,curfew,public-disorder,-105.001552,39.769505,sunnyside,1,0
2014-06-29 04:17:00,4,aggravated-assault,aggravated-assault,-105.018557,39.679229,college-view-south-platte,1,0
...,...,...,...,...,...,...,...,...
2017-09-13 05:48:00,460906,burglary-business-by-force,burglary,-105.033840,39.762365,west-highland,1,0
2017-09-12 20:37:00,460907,weapon-unlawful-discharge-of,all-other-crimes,-105.040313,39.721264,barnum-west,1,0
2017-09-12 16:32:00,460908,traf-habitual-offender,all-other-crimes,-104.847024,39.779596,montbello,1,0
2017-09-12 13:04:00,460909,criminal-mischief-other,public-disorder,-104.949183,39.756353,skyland,1,0


Unnamed: 0_level_0,Unnamed: 0,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
REPORTED_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-06-29 02:01:00,,,,,,,,
2014-06-29 01:54:00,,,,,,,,
2014-06-29 02:00:00,,,,,,,,
2014-06-29 02:18:00,,,,,,,,
2014-06-29 04:17:00,,,,,,,,
...,...,...,...,...,...,...,...,...
2017-09-13 05:48:00,460886.0,traffic-accident,traffic-accident,-104.924021,39.738229,hale,0.0,1.0
2017-09-12 20:37:00,460887.0,traffic-accident,traffic-accident,-104.912874,39.667630,hampden,0.0,1.0
2017-09-12 16:32:00,460888.0,theft-items-from-vehicle,theft-from-motor-vehicle,-104.981910,39.731684,capitol-hill,1.0,0.0
2017-09-12 13:04:00,460889.0,traffic-accident-hit-and-run,traffic-accident,-105.043950,39.787436,regis,0.0,1.0


### .rolling()
- 시계열 데이터의 이동 윈도우(rolling window)를 계산하는 데 사용
- 이동 평균, 이동 합계, 이동 표준편차 등과 같은 다양한 이동 통계량을 계산할 때 유용

In [55]:
# A small, self-contained example.
data = {'Value': list(range(1, 11))}
df = pd.DataFrame(data)
# With a window size of 3, compute the moving average, moving sum and moving
# standard deviation from one shared rolling view of the column.
value_window = df['Value'].rolling(window=3)
data_mean = value_window.mean()
data_sum = value_window.sum()
data_std = value_window.std()

In [56]:
print(data_mean)
# index 2: (1 + 2 + 3) / 3
# index 3: (2 + 3 + 4) / 3
# index 4: (3 + 4 + 5) / 3

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
Name: Value, dtype: float64


In [57]:
print(data_sum)
# index 2: 1 + 2 + 3 = 6
# index 3: 2 + 3 + 4 = 9
# index 4: 3 + 4 + 5 = 12

0     NaN
1     NaN
2     6.0
3     9.0
4    12.0
5    15.0
6    18.0
7    21.0
8    24.0
9    27.0
Name: Value, dtype: float64


In [58]:
print(data_std)
# Every window holds three consecutive integers, whose (sample) variance is 1, so each standard deviation is 1 as well.
# The standard deviation is the square root of the variance.

0    NaN
1    NaN
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
7    1.0
8    1.0
9    1.0
Name: Value, dtype: float64


In [59]:
# Moving average of IS_TRAFFIC over a window of 50 rows, taken in the frame's current row order.
sample_df_copy['IS_TRAFFIC'].rolling(window = 50).mean()

REPORTED_DATE
2014-06-29 02:01:00     NaN
2014-06-29 01:54:00     NaN
2014-06-29 02:00:00     NaN
2014-06-29 02:18:00     NaN
2014-06-29 04:17:00     NaN
                       ... 
2017-09-13 05:48:00    0.36
2017-09-12 20:37:00    0.36
2017-09-12 16:32:00    0.36
2017-09-12 13:04:00    0.34
2017-09-12 09:30:00    0.34
Name: IS_TRAFFIC, Length: 460911, dtype: float64