# Data Transformation for Datetime

Let's create a dataframe for the measurement of cars passing on a street.

In [None]:
import pandas as pd

# Seven daily measurements taken at 11:00, starting on 2021-01-01.
# 'D' (one day) replaces the '24H' alias: the upper-case 'H' frequency code
# is deprecated in recent pandas releases, while 'D' yields the same timestamps.
d = {'time of measurement': pd.date_range(start='2021-01-01 11:00', freq='D', periods=7),
     'number of cars': [60, 412, 230, 1234, 854, 1432, 1103]}
df = pd.DataFrame(data=d)
df

Now, let us add the day of the week of those chosen measurement days.

In [None]:
# Derive the weekday name from each measurement timestamp and store it as a new column.
measurement_times = df["time of measurement"]
df['day of the week'] = measurement_times.dt.day_name()
df

Next, let us add a new field called daytype, which describes whether the day is a weekday, a holiday or part of the weekend.

In [None]:
# Flag Saturdays and Sundays, then translate the boolean flag into a label.
# The labels are assigned directly instead of calling
# df["daytype"].replace(..., inplace=True): an in-place call on a column
# selection is a chained assignment and is not guaranteed to write back
# into df (pandas warns about it and Copy-on-Write ignores it).
is_weekend = df['day of the week'].isin(['Saturday', 'Sunday'])
df['daytype'] = is_weekend.map({False: 'weekday', True: 'weekend'})
# The first measurement day is marked as a holiday by hand.
df.loc[0, "daytype"] = "holiday"
df


Next, let us convert the daytype feature into a one-hot encoding feature.

In [None]:
# Build one indicator column per distinct daytype value.
df_onehot = pd.get_dummies(df["daytype"])
df_onehot

Finally, let us add the one-hot encoded feature to the original table.

In [None]:
# Join column-wise: the original columns followed by the indicator columns.
final_df = pd.concat(objs=[df, df_onehot], axis="columns")
final_df

# Data Scaling

First, let us load the original Melbourne housing dataset from storage.

In [None]:
from azureml.core import Workspace
# Obtain a handle to the Azure ML workspace; `ws` is used below to look up the datastore.
# NOTE(review): from_config() presumably reads a local workspace config file — confirm against the SDK docs.
ws = Workspace.from_config()

In [None]:
import matplotlib.pyplot as plt 
from azureml.core import Datastore, Dataset


# Retrieve an existing datastore in the workspace `ws` by its name.
datastore_name = 'mldemoblob'
datastore = Datastore.get(ws, datastore_name)

# Create a TabularDataset from the CSV file path inside that datastore.
datastore_path = [(datastore, 'melb_data.csv')]
tabdf = Dataset.Tabular.from_delimited_files(path=datastore_path)

# Convert the dataset to a pandas DataFrame and preview it with head().
raw_df = tabdf.to_pandas_dataframe()
raw_df.head()

Next, let us concentrate on the feature BuildingArea and the target Price.

In [None]:
# Keep only the target (Price) and the feature (BuildingArea), drop rows where
# either value is missing, and renumber the rows from 0.
# The columns are selected with an ordered list rather than a set literal:
# a set has no defined order (so the column order was unpredictable) and
# pandas >= 2.0 rejects set indexers outright. Price comes first because the
# scaling cells below label their output in that order.
# NOTE: this rebinds `df`, replacing the car-measurement frame from the first section.
df = (
    raw_df[["Price", "BuildingArea"]]
    .dropna(how='any')
    .reset_index(drop=True)
)
df

Now, let us use the StandardScaler to create scaled versions of BuildingArea and Price.

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every column of df.
stdscaler = StandardScaler()
stdscalerarray = stdscaler.fit_transform(df)
# Derive the labels from df's actual column order instead of hard-coding them,
# so a scaled column can never be labelled with the wrong source name; reuse
# df's index so the frames line up when concatenated later.
stdscaled_df = pd.DataFrame(
    stdscalerarray,
    columns=[f"StdSc({col})" for col in df.columns],
    index=df.index,
)
stdscaled_df.describe()

... and plot both of them.

In [None]:
# Draw one box per column of the standard-scaled frame in a 12x8 figure.
stdscaled_df.boxplot(figsize=(12,8))
plt.show()

Now, let us do the same with the MinMaxScaler.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Apply min-max scaling to every column of df.
mmscaler = MinMaxScaler()
mmscalerarray = mmscaler.fit_transform(df)
# Derive the labels from df's actual column order instead of hard-coding them,
# so a scaled column can never be labelled with the wrong source name; reuse
# df's index so the frames line up when concatenated later.
mmscaled_df = pd.DataFrame(
    mmscalerarray,
    columns=[f"MinMaxSc({col})" for col in df.columns],
    index=df.index,
)
mmscaled_df.describe()

In [None]:
# Draw one box per column of the min-max-scaled frame in a 12x8 figure.
mmscaled_df.boxplot(figsize=(12,8))
plt.show()

... and with the RobustScaler.

In [None]:
from sklearn.preprocessing import RobustScaler
# Apply robust scaling to every column of df.
robscaler = RobustScaler()
robscalerarray = robscaler.fit_transform(df)
# Derive the labels from df's actual column order instead of hard-coding them,
# so a scaled column can never be labelled with the wrong source name; reuse
# df's index so the frames line up when concatenated later.
robscaled_df = pd.DataFrame(
    robscalerarray,
    columns=[f"RobustSc({col})" for col in df.columns],
    index=df.index,
)
robscaled_df.describe()

In [None]:
# Draw one box per column of the robust-scaled frame in a 12x8 figure.
robscaled_df.boxplot(figsize=(12,8))
plt.show()

Let us incorporate all the scaled versions into the original dataframe.

In [None]:
# Place the unscaled frame and its three scaled variants side by side.
frames_to_join = [df, stdscaled_df, mmscaled_df, robscaled_df]
scaled_df = pd.concat(frames_to_join, axis="columns")
scaled_df

... and have a look at their statistical properties.

In [None]:
# Compare the raw Price column with its three scaled counterparts.
price_columns = ["Price", "StdSc(Price)", "MinMaxSc(Price)", "RobustSc(Price)"]
only_price_df = scaled_df[price_columns]


def _as_general_format(stat_column):
    """Render each value of one statistics column as text using the 'g' format spec."""
    return stat_column.apply("{:g}".format)


# One row per price variant, one column per summary statistic, all formatted as text.
price_summary = only_price_df.describe().T
dist_only_price_df = price_summary.apply(_as_general_format)
dist_only_price_df

As we can see, the standard-scaled values have a mean of 0 and a standard deviation of 1, whereas the min-max-scaled values have a minimum of 0 and a maximum of 1, and the robust-scaled values have a median of 0. See also the plots below.

In [None]:
# Select only the min-max-scaled Price column (kept as a one-column frame) and box-plot it.
scaled_price_df = scaled_df[["MinMaxSc(Price)"]]
scaled_price_df.boxplot(figsize=(12,8))
plt.show()

In [None]:
# Rebind scaled_price_df to the standard- and robust-scaled Price columns and
# box-plot them side by side in one figure.
scaled_price_df = scaled_df[["StdSc(Price)", "RobustSc(Price)"]]
scaled_price_df.boxplot(figsize=(12,8))
plt.show()

# Encoding Samples

First, let us create a dataframe using the data in the csv file.

In [None]:
# Read the sample file from the current working directory.
# A plain relative name is used instead of r'.\favoritesnacks.csv': the
# backslash is a path separator on Windows only, so on Linux/macOS kernels the
# old literal was looked up as a file name and the read failed.
enc_df = pd.read_csv('favoritesnacks.csv')
enc_df

Now, let us add a new feature for count encoding, where we count how often each snack occurs in the whole sample set and add this count as a feature value for each row.

In [None]:
# Number of rows per snack, indexed by snack name.
cntenc = enc_df.groupby("Favorite Snack").size()
# Vectorized lookup of each row's snack in the count table. Series.map replaces
# the row-by-row apply(lambda ...) and yields NaN for a snack that is missing
# from the table instead of raising KeyError.
enc_df['CntEnc(FavSnack)'] = enc_df["Favorite Snack"].map(cntenc)
enc_df

Now, let us do the same using frequency encoding, which does not show the absolute, but the relative amount for each snack item.

In [None]:
# Share of rows per snack: group size divided by the total number of rows.
frenc = enc_df.groupby("Favorite Snack").size() / len(enc_df)
# Vectorized lookup of each row's snack in the frequency table. Series.map
# replaces the row-by-row apply(lambda ...) and yields NaN for a snack that is
# missing from the table instead of raising KeyError.
enc_df['FreqEnc(FavSnack)'] = enc_df["Favorite Snack"].map(frenc)
enc_df

Finally, let us use a package called category_encoders, which implements a target encoding algorithm to add the final encoder type.

In [None]:
# Requires the third-party package: pip install category_encoders
from category_encoders import TargetEncoder
# Create a target encoder with the library's default settings.
encoder = TargetEncoder()
# Fit on the snack column with "Likelihood to Buy" as the target and store the
# encoded values as a new column.
# NOTE(review): the encoder is fitted and applied on the same rows, so each
# encoded value is derived from data that includes the row's own target —
# fine for a demo, but confirm before reusing this pattern for model training.
enc_df["TargetEnc(FavSnack)"] = encoder.fit_transform(enc_df["Favorite Snack"],enc_df["Likelihood to Buy"])
enc_df