#### Topics Covered today
- Date Functionality and Time Delta
- Categorical Data
- Sparse Data


In [None]:
import pandas as pd

1a. Creating a Date Column

In [None]:
# Ten consecutive dates beginning 2011-05-01; with no freq given the step is one day.
daily_index = pd.date_range(start='5/1/2011', periods=10)
print(daily_index)

DatetimeIndex(['2011-05-01', '2011-05-02', '2011-05-03', '2011-05-04',
               '2011-05-05', '2011-05-06', '2011-05-07', '2011-05-08',
               '2011-05-09', '2011-05-10'],
              dtype='datetime64[ns]', freq='D')


1b. Adding Frequency

In [None]:
# Five consecutive month-end dates, starting with the end of January 2011.
# The frequency is given as an offset object rather than the 'M' string alias:
# pandas 2.2 deprecated 'M' in favour of 'ME', so neither string works cleanly
# on every pandas version, while pd.offsets.MonthEnd() produces the same dates
# on all of them.
month_end_index = pd.date_range('1/1/2011', periods=5, freq=pd.offsets.MonthEnd())
print(month_end_index)

DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31', '2011-04-30',
               '2011-05-31'],
              dtype='datetime64[ns]', freq='M')


1c. Other Frequency Options

In [None]:
# 'MS' places each generated date on the first day of its month.
month_start_index = pd.date_range(start='1/1/2011', periods=5, freq='MS')
print(month_start_index)
# Next: show all the frequency options available in pandas.

DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01',
               '2011-05-01'],
              dtype='datetime64[ns]', freq='MS')


2a. Time Delta

In [None]:
# Take the current timestamp and move it forward by a one-day Timedelta.
one_day = pd.Timedelta('1 day')
day1 = pd.to_datetime('today')
day2 = day1 + one_day

# day_name() gives the weekday name of the shifted timestamp.
day2_weekday = day2.day_name()
print("Day 1:", day1)
print("Day 2:", day2, day2_weekday)

Day 1: 2020-10-03 16:26:54.744401
Day 2: 2020-10-04 16:26:54.744401 Sunday


2b. Date Operations

In [None]:
# A week of daily dates paired with per-row offsets of 0, 1, ..., 6 days.
date = pd.Series(pd.date_range(start='2020-1-1', periods=7, freq='D'))
to_be_added = pd.Series([pd.Timedelta(days=n_days) for n_days in range(7)])
date_df = pd.DataFrame(dict(Date=date, To_Add=to_be_added))
print(date_df)

# Datetime and timedelta columns add element-wise, row by row.
shifted_forward = date_df['Date'] + date_df['To_Add']
date_df['Final_Date'] = shifted_forward
print(date_df)

NameError: ignored

In [None]:
# Same week of dates and 0..6-day offsets, this time assembled column by column.
date = pd.Series(pd.date_range(start='2020-1-1', periods=7, freq='D'))
to_be_added = pd.Series([pd.Timedelta(days=shift) for shift in range(7)])
date_df = pd.DataFrame({'Date': date})
date_df['To_Add'] = to_be_added
print(date_df)

# Subtracting row i's offset of i days from the i-th date lands every row on 2020-01-01.
rolled_back = date_df['Date'] - date_df['To_Add']
date_df['Final_Date'] = rolled_back
print(date_df)

        Date To_Add
0 2020-01-01 0 days
1 2020-01-02 1 days
2 2020-01-03 2 days
3 2020-01-04 3 days
4 2020-01-05 4 days
5 2020-01-06 5 days
6 2020-01-07 6 days
        Date To_Add Final_Date
0 2020-01-01 0 days 2020-01-01
1 2020-01-02 1 days 2020-01-01
2 2020-01-03 2 days 2020-01-01
3 2020-01-04 3 days 2020-01-01
4 2020-01-05 4 days 2020-01-01
5 2020-01-06 5 days 2020-01-01
6 2020-01-07 6 days 2020-01-01


In [None]:
# Split each date into calendar components with the .dt accessor.
# Only `date_df` (built in the previous cell) is needed here; the `date` and
# `to_be_added` series are not used by this cell, so they are not rebuilt.
date_df['year'] = date_df['Date'].dt.year
date_df['month'] = date_df['Date'].dt.month
date_df['day'] = date_df['Date'].dt.day
print(date_df)

        Date To_Add Final_Date  year  month  day
0 2020-01-01 0 days 2020-01-01  2020      1    1
1 2020-01-02 1 days 2020-01-01  2020      1    2
2 2020-01-03 2 days 2020-01-01  2020      1    3
3 2020-01-04 3 days 2020-01-01  2020      1    4
4 2020-01-05 4 days 2020-01-01  2020      1    5
5 2020-01-06 5 days 2020-01-01  2020      1    6
6 2020-01-07 6 days 2020-01-01  2020      1    7


More date operations here: https://pandas.pydata.org/pandas-docs/stable/user_guide/timedeltas.html

3a. Categorical Data

In [None]:
# Store the repeated string labels using the categorical dtype.
gender_labels = ["Male", "Female", "Male", "Female", "Female"]
gender = pd.Series(gender_labels, dtype="category")
print(gender)

0      Male
1    Female
2      Male
3    Female
4    Female
dtype: category
Categories (2, object): ['Female', 'Male']


3b. Get all categories

In [None]:
# Wrap the series in a Categorical; its printed form lists the values
# followed by the distinct categories.
Categories = pd.Categorical(values=gender)
print(Categories)


['Male', 'Female', 'Male', 'Female', 'Female']
Categories (2, object): ['Female', 'Male']


3c. Accessing the objects from the categories

In [None]:
# Positional indexing on the Categorical returns the stored values at those positions.
first_value, second_value = Categories[0], Categories[1]
print("Categories:", first_value, "and", second_value)

Categories: Male and Female


3d. Summary of the Categorical attribute

In [None]:
# describe() on a categorical series reports count, unique, top and freq.
gender_summary = gender.describe()
print(gender_summary)

count          5
unique         2
top       Female
freq           3
dtype: object


3e. Removing certain categories

In [None]:
# Removing a category returns a new series; rows that held it become NaN.
gender_without_male = gender.cat.remove_categories("Male")
print(gender_without_male)

0       NaN
1    Female
2       NaN
3    Female
4    Female
dtype: category
Categories (1, object): ['Female']


4a. Sparse Data - Useful when most elements share a single value (such as zero or NaN), so only the remaining values need to be stored

In [None]:
import pandas as pd
import numpy as np


# Start with a 10000 x 4 frame of random floats, then overwrite every row
# except the last two with NaN so that almost all values are the same.
df = pd.DataFrame(np.random.randn(10000, 4))
df.iloc[:9998] = np.nan

# Cast to a sparse float dtype that uses NaN as the fill value.
nan_sparse_dtype = pd.SparseDtype("float", np.nan)
sdf = df.astype(nan_sparse_dtype)
print(type(sdf))

<class 'pandas.core.frame.DataFrame'>


In [None]:
# Per-column dtypes of the sparse frame; every column was cast to SparseDtype("float", np.nan).
sdf.dtypes

0    Sparse[float64, nan]
1    Sparse[float64, nan]
2    Sparse[float64, nan]
3    Sparse[float64, nan]
dtype: object

In [None]:
# Share of values that are not the NaN fill value: only the last 2 of the
# 10000 rows were left populated when the frame was built.
sdf.sparse.density

0.0002

To Do
- Read and try out the examples in the pandas time series documentation - https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
- Explore Categorical data with a Kaggle dataset