# Data Transformation for Datetime

In [None]:
import pandas as pd

# Build a small demo frame: one traffic measurement per day for a week,
# each taken at 11:00, starting on 2021-01-01.
# The step is given as an explicit Timedelta instead of the '24H' alias,
# because the uppercase 'H' offset alias is deprecated since pandas 2.2.
d = {
    'time of measurement': pd.date_range(
        start='2021-01-01 11:00',
        freq=pd.Timedelta(hours=24),
        periods=7,
    ),
    'number of cars': [60, 412, 230, 1234, 854, 1432, 1103],
}
df = pd.DataFrame(data=d)
df

In [None]:
# Derive the weekday name of every measurement timestamp and store it
# as a new column next to the raw data.
weekday_names = df["time of measurement"].dt.day_name()
df['day of the week'] = weekday_names
df

In [None]:
# Classify each row as 'weekend' (Saturday/Sunday) or 'weekday'.
# The labels are assigned directly to the column: calling
# .replace(..., inplace=True) on df["daytype"] is a chained in-place
# update, which warns on pandas 2.x and no longer modifies df once
# Copy-on-Write is active.
df['daytype'] = (
    df['day of the week']
    .isin(['Saturday', 'Sunday'])
    .map({False: 'weekday', True: 'weekend'})
)
# Manually override the first row (index label 0) as a holiday.
df.loc[0, "daytype"] = "holiday"
df


In [None]:
# One-hot encode the day type: one indicator column per distinct value
# found in df["daytype"].
df_onehot = pd.get_dummies(df["daytype"])
df_onehot

In [None]:
# Put the indicator columns side by side with the original frame
# (column-wise concatenation, rows matched on the index).
final_df = pd.concat(objs=[df, df_onehot], axis="columns")
final_df

# Data Scaling

In [None]:
from azureml.core import Workspace
# Obtain the Azure ML workspace handle that the following cells use to
# look up the datastore.
# NOTE(review): from_config() presumably reads a local workspace config
# file - confirm it is present where this notebook runs.
ws = Workspace.from_config()

In [None]:
import matplotlib.pyplot as plt 
from azureml.core import Datastore, Dataset


# Retrieve an existing datastore in the workspace by its name.
datastore_name = 'mldemoblob'
datastore = Datastore.get(ws, datastore_name)

# Create a TabularDataset from the (datastore, file path) pair that
# points at the CSV file inside the datastore.
datastore_path = [(datastore, 'melb_data.csv')]
tabdf = Dataset.Tabular.from_delimited_files(path=datastore_path)

# Materialise the dataset as a pandas DataFrame and preview its first rows.
raw_df = tabdf.to_pandas_dataframe()
raw_df.head()

In [None]:
# Keep only the two numeric columns used in the scaling examples and drop
# every row where either value is missing.
# The columns are selected with an ordered list, not a set: a set has no
# defined order and pandas >= 2.0 rejects it as an indexer. "Price" comes
# first because the scaled-column labels further down list Price first.
df = (
    raw_df[["Price", "BuildingArea"]]
    .dropna(how='any')
    .reset_index(drop=True)  # contiguous 0..n-1 index for later column-wise concat
)
df


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of df with StandardScaler.
stdscaler = StandardScaler()
stdscalerarray = stdscaler.fit_transform(df)
# Derive the output labels from df's own column order so a label can never
# end up on the wrong column, and keep df's index so the frames line up
# when they are concatenated column-wise later.
stdscaled_df = pd.DataFrame(
    stdscalerarray,
    columns=[f"StdSc({col})" for col in df.columns],
    index=df.index,
)
stdscaled_df.describe()

In [None]:
# Draw one box per column of the standard-scaled frame, then render the figure.
stdscaled_df.boxplot(figsize=(12,8))
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of df with MinMaxScaler (default settings).
mmscaler = MinMaxScaler()
mmscalerarray = mmscaler.fit_transform(df)
# Derive the output labels from df's own column order so a label can never
# end up on the wrong column, and keep df's index so the frames line up
# when they are concatenated column-wise later.
mmscaled_df = pd.DataFrame(
    mmscalerarray,
    columns=[f"MinMaxSc({col})" for col in df.columns],
    index=df.index,
)
mmscaled_df.describe()

In [None]:
# Draw one box per column of the min-max-scaled frame, then render the figure.
mmscaled_df.boxplot(figsize=(12,8))
plt.show()

In [None]:
from sklearn.preprocessing import RobustScaler

# Rescale every column of df with RobustScaler (default settings).
robscaler = RobustScaler()
robscalerarray = robscaler.fit_transform(df)
# Derive the output labels from df's own column order so a label can never
# end up on the wrong column, and keep df's index so the frames line up
# when they are concatenated column-wise later.
robscaled_df = pd.DataFrame(
    robscalerarray,
    columns=[f"RobustSc({col})" for col in df.columns],
    index=df.index,
)
robscaled_df.describe()

In [None]:
# Draw one box per column of the robust-scaled frame, then render the figure.
robscaled_df.boxplot(figsize=(12,8))
plt.show()

In [None]:
# Line up the unscaled columns with the output of all three scalers in a
# single wide frame (column-wise concatenation, rows matched on the index).
scaled_df = pd.concat(
    objs=[df, stdscaled_df, mmscaled_df, robscaled_df],
    axis="columns",
)
scaled_df

In [None]:
# Compare the distribution of the raw price with each scaled variant.
only_price_df = scaled_df.loc[
    :, ["Price", "StdSc(Price)", "MinMaxSc(Price)", "RobustSc(Price)"]
]
# Transpose the summary so each price variant is a row, then render every
# statistic as a string in general ('g') number format.
dist_only_price_df = (
    only_price_df.describe()
    .transpose()
    .apply(lambda column: column.map(lambda value: format(value, 'g')))
)
dist_only_price_df

In [None]:
# Box plot of the min-max scaled price on its own.
scaled_price_df = scaled_df.loc[:, ["MinMaxSc(Price)"]]
scaled_price_df.boxplot(figsize=(12, 8))
plt.show()

In [None]:
# Box plot of the standard-scaled and robust-scaled price side by side.
scaled_price_df = scaled_df.loc[:, ["StdSc(Price)", "RobustSc(Price)"]]
scaled_price_df.boxplot(figsize=(12, 8))
plt.show()

# Encoding Samples

In [None]:
# Load the snack survey sample from the current working directory.
# A plain relative file name is used instead of the Windows-only '.\' prefix:
# outside Windows the backslash would be treated as part of the file name.
enc_df = pd.read_csv('favoritesnacks.csv')
enc_df

In [None]:
# Count encoding: replace each snack by the number of rows that share it.
cntenc = enc_df.groupby("Favorite Snack").size()
# Series.map performs the lookup in one vectorised step and yields NaN for a
# missing snack value, where a per-row cntenc[x] lookup would raise KeyError
# (groupby leaves NaN keys out of the counts).
enc_df['CntEnc(FavSnack)'] = enc_df["Favorite Snack"].map(cntenc)
enc_df


In [None]:
# Frequency encoding: replace each snack by the share of all rows that
# carry it (group size divided by the total number of rows).
frenc = enc_df.groupby("Favorite Snack").size() / len(enc_df)
# Series.map performs the lookup in one vectorised step and yields NaN for a
# missing snack value, where a per-row frenc[x] lookup would raise KeyError
# (groupby leaves NaN keys out of the counts).
enc_df['FreqEnc(FavSnack)'] = enc_df["Favorite Snack"].map(frenc)
enc_df

In [None]:
# Requires the third-party package: pip install category_encoders
from category_encoders import TargetEncoder
# Target encoding: fit the encoder on the "Favorite Snack" column with
# "Likelihood to Buy" as the target, and store the encoded values in a
# new column.
# NOTE(review): the encoder is fitted and applied on the same rows, so the
# encoded values have seen their own targets - fine for a demo; confirm
# before reusing this in a modelling pipeline.
encoder = TargetEncoder()
enc_df["TargetEnc(FavSnack)"] = encoder.fit_transform(enc_df["Favorite Snack"],enc_df["Likelihood to Buy"])
enc_df