In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Numerical Data

### Log Function

$\log_{b}(x) = y$ if and only if $b^y = x$, for a base $b > 0$, $b \neq 1$, and $x > 0$.

In [None]:
# Integer grid 1..99; starting at 1 avoids evaluating log(0).
x = np.arange(1, 100)

# Evaluate the base-2, base-10 and natural logarithm on the same grid so the
# three curves can be compared point by point.
ylog2, ylog10, yln = (log_fn(x) for log_fn in (np.log2, np.log10, np.log))

In [None]:
# Draw the three logarithm curves on one explicit Axes so their growth rates
# can be compared directly. x, ylog2, ylog10 and yln come from the previous cell.
fig, ax = plt.subplots(dpi=200)
ax.plot(x, ylog2, color="red", label="y=log2(x)")
ax.plot(x, ylog10, color="green", label="y=log10(x)")
ax.plot(x, yln, color="blue", label="y=logE(x)")

ax.set_title('Logarithm Functions')
ax.set_xlabel("x")
ax.set_ylabel("y")
# The trailing semicolon keeps the Legend object's repr out of the cell output.
ax.legend();

## Quantile-based discretization function

`qcut()` - Discretize a variable into quantile-based buckets, so that each bucket holds roughly the same number of observations
https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.qcut.html

`cut()` is like `qcut()`, but it lets you define the bin edges explicitly instead of deriving them from quantiles
https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.cut.html

**Usage**
- divide customers into 3, 4 or N groupings
- split income into quartiles
- whenever you need to convert a continuous variable into a categorical one.

In [None]:
# Fixed seed so the sample, and therefore the bin edges printed below, is
# identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# 100 random integers drawn from 0..100 (upper bound 101 is exclusive).
# NOTE: this rebinds `x`, replacing the 1..99 grid used by the log plots above.
x = rng.integers(0, 101, size=100)

# Quantile-based binning into 10 buckets (deciles); `bins` holds one interval
# label per element of x.
bins = pd.qcut(x, 10)
print(f'bins: \n{bins}\n')

# Number of samples that landed in each interval.
values_count = bins.value_counts()
print(f'values_count: \n{values_count}')

In [None]:
# Bar chart of how many samples fell into each quantile bin.
ax = values_count.plot.bar(rot=0, color="b", figsize=(12, 4))

# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_title("Samples per quantile bin")
ax.set_xlabel("bin (value interval)")
ax.set_ylabel("number of samples")
ax.grid(True)

## Transformation of categorical labels into numeric values

### Useful methods

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html

In [None]:
# Toy frame for the encoding examples: one categorical column (gender) and
# two integer columns (height, y). Each tuple below is one row.
df = pd.DataFrame(
    [
        ('m', 186, 3),
        ('f', 164, 1),
        ('f', 158, 0),
        ('m', 195, 5),
    ],
    columns=['gender', 'height', 'y'],
)
df.head()


In [None]:
# Peek at the raw categorical labels before they are encoded.
df.loc[:, 'gender']

In [None]:
# Numeric code assigned to each gender label.
mappings = {'m': 0, 'f': 1}

# Encode the labels. Values that are not keys of `mappings` are passed through
# unchanged (a plain dict `.map` would turn them into NaN), so re-running this
# cell on the already-encoded column is a no-op instead of wiping it out.
df['gender'] = df['gender'].map(lambda label: mappings.get(label, label))

# Fail loudly if any label was left unencoded (e.g. a typo such as 'M').
assert df['gender'].isin(list(mappings.values())).all(), 'unmapped gender label'
df

## Scikit-learn Preprocessing Module 

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing