# Converting string to datetime

In [8]:
import numpy as np
import pandas as pd
# Day-first date strings with a 12-hour clock, held in a NumPy string array.
date_string = np.array([
    '03-04-2005 11:40 PM',
    '05-04-2006 2:10 AM',
    '13-07-2007 3:20 AM',
    '15-11-2009 3:10 PM',
])
date_string

array(['03-04-2005 11:40 PM', '05-04-2006 2:10 AM', '13-07-2007 3:20 AM',
       '15-11-2009 3:10 PM'], dtype='<U19')

In [9]:
# Parse the whole array in one vectorized call instead of once per element.
# Format: day-month-year, 12-hour clock with AM/PM. list() keeps `ts` a plain
# list of Timestamp objects, exactly as the per-element version produced.
ts = list(pd.to_datetime(date_string, format='%d-%m-%Y %I:%M %p'))

In [10]:
# Show the parsed values: each string is now a pandas Timestamp on a 24-hour clock.
ts

[Timestamp('2005-04-03 23:40:00'),
 Timestamp('2006-04-05 02:10:00'),
 Timestamp('2007-07-13 03:20:00'),
 Timestamp('2009-11-15 15:10:00')]

If `errors='coerce'` is passed, a value that cannot be parsed does not raise an error (raising is the default behaviour); instead, the offending value is set to `NaT`.

In [16]:
# Example: the last entry uses '.' separators, so it does not match the format
# and is coerced to NaT instead of raising.
date_strings = np.array([
    '03-04-2005 10:40 PM',
    '05-04-2006 2:10 AM',
    '13-07-2007 3:10 PM',
    '13-06-2014 4:20 PM',
    '15.11.2009 10:20 AM',
])
# Parse the whole array in one vectorized call rather than once per element;
# list() keeps `q` a plain list of Timestamp/NaT values, as before.
q = list(pd.to_datetime(date_strings, format='%d-%m-%Y %I:%M %p', errors='coerce'))
q

[Timestamp('2005-04-03 22:40:00'),
 Timestamp('2006-04-05 02:10:00'),
 Timestamp('2007-07-13 15:10:00'),
 Timestamp('2014-06-13 16:20:00'),
 NaT]

# Handling time zones

In [17]:
# Create a timezone-aware timestamp directly by passing tz at construction time.
pd.Timestamp('2017-05-01 06:00:00',tz='Europe/London')

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [19]:
# Start from a naive timestamp (no time zone attached); a zone can be added
# to an already-created Timestamp afterwards.
date = pd.Timestamp('2016-06-01 06:00:00')
date

Timestamp('2016-06-01 06:00:00')

In [22]:
# Attach a time zone to the naive timestamp: the wall-clock time is kept
# and labelled as London time.
date_in_london = date.tz_localize(tz='Europe/London')
date_in_london

Timestamp('2016-06-01 06:00:00+0100', tz='Europe/London')

In [23]:
# Convert the London-aware timestamp to the same instant expressed in New York time.
date_in_london.tz_convert('America/New_York')

Timestamp('2016-06-01 01:00:00-0400', tz='America/New_York')

In [25]:
# Finally, a pandas Series can apply tz_localize and tz_convert to every element
# through its .dt accessor. Build three month-end dates to demonstrate.
# A MonthEnd offset object is used instead of the 'M' string alias, which is
# deprecated in pandas 2.2+; the offset object works on old and new versions alike.
ts = pd.Series(pd.date_range('2015-12-03', periods=3, freq=pd.offsets.MonthEnd()))
ts

0   2015-12-31
1   2016-01-31
2   2016-02-29
dtype: datetime64[ns]

In [29]:
# Label every (naive) element of the Series as New York local time.
tsa = ts.dt.tz_localize(tz='America/New_York')
tsa

0   2015-12-31 00:00:00-05:00
1   2016-01-31 00:00:00-05:00
2   2016-02-29 00:00:00-05:00
dtype: datetime64[ns, America/New_York]

In [30]:
# Re-express every New York timestamp in London time (same instants, new zone).
tsa.dt.tz_convert('Europe/London')

0   2015-12-31 05:00:00+00:00
1   2016-01-31 05:00:00+00:00
2   2016-02-29 05:00:00+00:00
dtype: datetime64[ns, Europe/London]

In [32]:
from pytz import all_timezones
# Peek at the first five time zone names that pytz exposes.
all_timezones[:5]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara']

# Creating a random date time dataframe

In [36]:
# Frame of five random integers drawn from [5, 10), indexed by consecutive days.
# The draw is unseeded, so the values differ from run to run.
df1 = pd.DataFrame(
    data=np.random.randint(low=5, high=10, size=5),
    index=pd.date_range(start='2014-10-10', periods=5, freq='D'),
    columns=['date'],
)
df1

Unnamed: 0,date
2014-10-10,7
2014-10-11,8
2014-10-12,6
2014-10-13,6
2014-10-14,5


# Selecting over a datetime DataFrame

In [38]:
# Build a frame holding 10,000 consecutive hourly timestamps in a 'Date' column.
# An Hour offset object replaces the 'H' string alias, which is deprecated in
# pandas 2.2+; the offset object behaves the same on old and new versions.
df2 = pd.DataFrame(
    {'Date': pd.date_range('2011-10-10', periods=10000, freq=pd.offsets.Hour())}
)
df2.head(5)

Unnamed: 0,Date
0,2011-10-10 00:00:00
1,2011-10-10 01:00:00
2,2011-10-10 02:00:00
3,2011-10-10 03:00:00
4,2011-10-10 04:00:00


In [40]:
# Select rows by time with a boolean mask: strictly after 01:00 and up to
# (and including) 04:00 on 2011-10-10.
df2.loc[
    (df2['Date'] > '2011-10-10 01:00:00')
    & (df2['Date'] <= '2011-10-10 04:00:00')
]

Unnamed: 0,Date
2,2011-10-10 02:00:00
3,2011-10-10 03:00:00
4,2011-10-10 04:00:00


In [43]:
# Alternative approach: promote the 'Date' column to the DataFrame's index
# (the column itself is dropped, leaving a frame with no data columns).
df3 = df2.set_index(keys='Date')
df3.head(n=3)

2011-10-10 00:00:00
2011-10-10 01:00:00
2011-10-10 02:00:00


In [45]:
# Index by 'Date' while also keeping 'Date' as a regular column
# (drop=False preserves the column that becomes the index).
df3 = df2.set_index('Date', drop=False)
df3.head(n=5)

Unnamed: 0_level_0,Date
Date,Unnamed: 1_level_1
2011-10-10 00:00:00,2011-10-10 00:00:00
2011-10-10 01:00:00,2011-10-10 01:00:00
2011-10-10 02:00:00,2011-10-10 02:00:00
2011-10-10 03:00:00,2011-10-10 03:00:00
2011-10-10 04:00:00,2011-10-10 04:00:00


In [46]:
# With a DatetimeIndex we can slice by label; as the output shows, both the
# start (01:00) and the end (04:00) of the slice are included.
df3.loc['2011-10-10 01:00:00':'2011-10-10 04:00:00']

Unnamed: 0_level_0,Date
Date,Unnamed: 1_level_1
2011-10-10 01:00:00,2011-10-10 01:00:00
2011-10-10 02:00:00,2011-10-10 02:00:00
2011-10-10 03:00:00,2011-10-10 03:00:00
2011-10-10 04:00:00,2011-10-10 04:00:00


# Breaking up date data into multiple features

In [47]:
# One hundred consecutive month-end dates in a 'date' column.
# A MonthEnd offset object replaces the 'M' string alias, which is deprecated
# in pandas 2.2+; the offset object works on old and new versions alike.
dataframe = pd.DataFrame(
    {'date': pd.date_range('2017-01-02', periods=100, freq=pd.offsets.MonthEnd())}
)
dataframe.head()

Unnamed: 0,date
0,2017-01-31
1,2017-02-28
2,2017-03-31
3,2017-04-30
4,2017-05-31


In [48]:
# Split each timestamp into separate calendar/clock feature columns.
dataframe['Year'] = dataframe['date'].dt.year
dataframe['Month'] = dataframe['date'].dt.month
dataframe['Day'] = dataframe['date'].dt.day
# Fixed: 'Minute' was previously filled from .dt.hour; it must read .dt.minute.
dataframe['Minute'] = dataframe['date'].dt.minute
dataframe['Hour'] = dataframe['date'].dt.hour

In [49]:
# Inspect the new feature columns; Minute and Hour are 0 because the dates carry no time of day.
dataframe.head(5)

Unnamed: 0,date,Year,Month,Day,Minute,Hour
0,2017-01-31,2017,1,31,0,0
1,2017-02-28,2017,2,28,0,0
2,2017-03-31,2017,3,31,0,0
3,2017-04-30,2017,4,30,0,0
4,2017-05-31,2017,5,31,0,0


# Calculating difference between dates

In [52]:
# Two trips, each with a departure ('Left') and an arrival ('Arrived') date.
dataframe = pd.DataFrame({
    'Left': [pd.Timestamp('2017-11-01'), pd.Timestamp('2017-10-01')],
    'Arrived': [pd.Timestamp('2017-11-01'), pd.Timestamp('2017-11-15')],
})
dataframe

Unnamed: 0,Left,Arrived
0,2017-11-01,2017-11-01
1,2017-10-01,2017-11-15


In [53]:
# Row-wise difference between the 'Arrived' and 'Left' dates; subtracting two
# datetime columns yields a timedelta column.
dataframe['Arrived']-dataframe['Left']

0    0 days
1   45 days
dtype: timedelta64[ns]

In [54]:
# Difference between two individual dates: subtracting Timestamps gives a Timedelta.
pd.Timestamp('2017-10-10')-pd.Timestamp('2017-10-02')

Timedelta('8 days 00:00:00')

In [57]:
# Keep only the whole-day count of each difference, as a list of plain ints
# (drops the "days" label shown in the timedelta output).
# The .dt.days accessor does this in one vectorized step instead of a Python loop.
p = (dataframe['Arrived'] - dataframe['Left']).dt.days.tolist()
p

[0, 45]

In [59]:
# Wrap the list of day counts back into a Series (integer dtype, per the output).
pd.Series(p)

0     0
1    45
dtype: int64

# Encoding days of the week

In [7]:
import pandas as pd
# Three consecutive month-end dates starting from February 2002.
# A MonthEnd offset object replaces the 'M' string alias, which is deprecated
# in pandas 2.2+; the offset object works on old and new versions alike.
dates = pd.Series(pd.date_range("2/2/2002", periods=3, freq=pd.offsets.MonthEnd()))
dates

0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns]

In [11]:
# Weekday of each date in the Series as an integer; comparing with the day
# names shown in the next cell, Thursday is 3 and Sunday is 6 (Monday = 0).
dates.dt.weekday

0    3
1    6
2    1
dtype: int64

In [22]:
# Weekday name of each date in the Series, formatted with the '%A' directive.
dates.dt.strftime('%A')

0    Thursday
1      Sunday
2     Tuesday
dtype: object

In [20]:
# The same lookup on a single Timestamp: weekday() is a method here, not an attribute.
pd.Timestamp('2002-04-30').weekday()

1

In [21]:
# Weekday name of a single Timestamp via strftime('%A').
pd.Timestamp('2002-04-30').strftime('%A')

'Tuesday'

In [28]:
# Weekday name of the current date; the result depends on the day the cell is run.
# NOTE(review): the alias `dt` is easy to confuse with the pandas `.dt` accessor
# used elsewhere in this notebook.
from datetime import datetime as dt
dt.today().strftime('%A')

'Wednesday'

In [30]:
# Create a DataFrame with five consecutive month-end dates in a 'Date' column.
# A MonthEnd offset object replaces the 'M' string alias, which is deprecated
# in pandas 2.2+; the offset object works on old and new versions alike.
df = pd.DataFrame(
    {'Date': pd.date_range('2017-11-20', periods=5, freq=pd.offsets.MonthEnd())}
)
df

Unnamed: 0,Date
0,2017-11-30
1,2017-12-31
2,2018-01-31
3,2018-02-28
4,2018-03-31


In [31]:
# Weekday name for each row of the DataFrame's 'Date' column.
df['Date'].dt.strftime('%A')

0     Thursday
1       Sunday
2    Wednesday
3    Wednesday
4     Saturday
Name: Date, dtype: object

In [32]:
# Integer weekday for each row of the 'Date' column (Thursday -> 3, Sunday -> 6 in the output).
df['Date'].dt.weekday

0    3
1    6
2    2
3    2
4    5
Name: Date, dtype: int64

# Creating a lagged feature

In [35]:
import numpy as np
# Seven days of made-up stock prices (random integers in [1, 4)) with their dates.
# The draw is unseeded, so the prices differ from run to run.
df = pd.DataFrame({
    'stockprice': np.random.randint(1, 4, 7),
    'date': pd.date_range('2007-07-06', periods=7, freq='D'),
})
df

Unnamed: 0,stockprice,date
0,3,2007-07-06
1,2,2007-07-07
2,2,2007-07-08
3,3,2007-07-09
4,1,2007-07-10
5,3,2007-07-11
6,1,2007-07-12


In [37]:
# Lag the price by one row so each day also carries the previous day's price;
# the first row has no predecessor and becomes NaN.
df['Previous day stock price'] = df['stockprice'].shift(periods=1)

In [38]:
# Show the frame with the lagged column; the lagged values are floats because of the leading NaN.
df

Unnamed: 0,stockprice,date,Previous day stock price
0,3,2007-07-06,
1,2,2007-07-07,3.0
2,2,2007-07-08,2.0
3,3,2007-07-09,2.0
4,1,2007-07-10,3.0
5,3,2007-07-11,1.0
6,1,2007-07-12,3.0


# Use Rolling Time Windows

In [42]:
# Six daily observations of a made-up stock price, indexed by date.
# The draw is unseeded, so the prices differ from run to run.
df = pd.DataFrame(
    data={'stockprice': np.random.randint(1, 4, 6)},
    index=pd.date_range(start='2005-06-02', periods=6, freq='D'),
)
df

Unnamed: 0,stockprice
2005-06-02,3
2005-06-03,1
2005-06-04,1
2005-06-05,2
2005-06-06,2
2005-06-07,2


In [44]:
# Rolling mean over a window of two rows: each value averages the current and
# previous row, so the first row (no full window yet) is NaN.
df['stockprice'].rolling(window=2).mean()

2005-06-02    NaN
2005-06-03    2.0
2005-06-04    1.0
2005-06-05    1.5
2005-06-06    2.0
2005-06-07    2.0
Freq: D, Name: stockprice, dtype: float64

In [48]:
# Six observations of a made-up stock price, spaced three months apart on month ends.
# MonthEnd(3) replaces the '3M' string alias, which is deprecated in pandas 2.2+;
# the offset object produces the same dates on old and new versions.
# The draw is unseeded, so the prices differ from run to run.
df = pd.DataFrame(
    {'stockprice': np.random.randint(1, 4, 6)},
    index=pd.date_range('2001-01-01', periods=6, freq=pd.offsets.MonthEnd(3)),
)
df

Unnamed: 0,stockprice
2001-01-31,1
2001-04-30,2
2001-07-31,3
2001-10-31,1
2002-01-31,2
2002-04-30,2


In [49]:
# Rolling mean over three rows; the first two rows lack a full window and are NaN.
df['stockprice'].rolling(3).mean()

2001-01-31         NaN
2001-04-30         NaN
2001-07-31    2.000000
2001-10-31    2.000000
2002-01-31    2.000000
2002-04-30    1.666667
Freq: 3M, Name: stockprice, dtype: float64

# Handling Missing Value in Timeseries

In [51]:
# Daily price series with a gap of three missing values between 2 and 5.
df = pd.DataFrame(
    data={'stockprice': [1, 2, np.nan, np.nan, np.nan, 5]},
    index=pd.date_range(start='2001-02-03', periods=6, freq='D'),
)
df

Unnamed: 0,stockprice
2001-02-03,1.0
2001-02-04,2.0
2001-02-05,
2001-02-06,
2001-02-07,
2001-02-08,5.0


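    interpolate() fills the gap linearly: the values rise from 2 to 5 over 4 steps, so each step is (5-2)/4 = 0.75.
    It adds 0.75 to the last non-NaN value and keeps adding 0.75 across each consecutive NaN (2.75, 3.50, 4.25).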

In [52]:
# Default interpolation: fill the gap with evenly spaced values between 2 and 5.
df['stockprice'].interpolate()

2001-02-03    1.00
2001-02-04    2.00
2001-02-05    2.75
2001-02-06    3.50
2001-02-07    4.25
2001-02-08    5.00
Freq: D, Name: stockprice, dtype: float64

In [53]:
# Alternatively, forward-fill: carry the last known value (2) into every missing slot.
df['stockprice'].ffill()

2001-02-03    1.0
2001-02-04    2.0
2001-02-05    2.0
2001-02-06    2.0
2001-02-07    2.0
2001-02-08    5.0
Freq: D, Name: stockprice, dtype: float64

In [54]:
# Or back-fill: pull the next known value (5) backwards into every missing slot.
df['stockprice'].bfill()

2001-02-03    1.0
2001-02-04    2.0
2001-02-05    5.0
2001-02-06    5.0
2001-02-07    5.0
2001-02-08    5.0
Freq: D, Name: stockprice, dtype: float64

In [55]:
# Quadratic interpolation: fills along a curve rather than a straight line, so
# the steps shrink across the gap (0.9, 0.8, 0.7, 0.6 in the output).
# NOTE(review): pandas documents this method as delegating to SciPy — confirm SciPy is installed.
df['stockprice'].interpolate(method='quadratic')

2001-02-03    1.0
2001-02-04    2.0
2001-02-05    2.9
2001-02-06    3.7
2001-02-07    4.4
2001-02-08    5.0
Freq: D, Name: stockprice, dtype: float64

In [56]:
# Fill at most two consecutive missing values, working forward from the last
# known value; the third missing value stays NaN.
df['stockprice'].interpolate(limit=2,limit_direction='forward')

2001-02-03    1.00
2001-02-04    2.00
2001-02-05    2.75
2001-02-06    3.50
2001-02-07     NaN
2001-02-08    5.00
Freq: D, Name: stockprice, dtype: float64

In [57]:
# Same limit of two, but working backward from the next known value; the
# first missing value of the gap stays NaN.
df['stockprice'].interpolate(limit=2,limit_direction='backward')

2001-02-03    1.00
2001-02-04    2.00
2001-02-05     NaN
2001-02-06    3.50
2001-02-07    4.25
2001-02-08    5.00
Freq: D, Name: stockprice, dtype: float64

In [58]:
# Forward-fill at most two consecutive missing values; the third remains NaN.
df['stockprice'].ffill(limit=2)

2001-02-03    1.0
2001-02-04    2.0
2001-02-05    2.0
2001-02-06    2.0
2001-02-07    NaN
2001-02-08    5.0
Freq: D, Name: stockprice, dtype: float64