# COURSE: Improving data quality in data analytics & machine learning
## SECTION: Data transformations
### LECTURE: Code: Transforming data
#### TEACHER: Mike X Cohen, sincxpress.com
##### COURSE URL: udemy.com/course/dataquality_x/?couponCode=202204

In [None]:
# import libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Z-score scaling

In [None]:
# download the sample dataset from the course website into a DataFrame
url = "https://sincxpress.com/sampledata.csv"
df = pd.read_csv(filepath_or_buffer=url)

# summary statistics (count, mean, std, quartiles, extremes) per numeric column
df.describe()

In [None]:
# one box per column, with the individual observations overlaid as black dots
ax = sns.boxplot(data=df, palette='Purples')
sns.stripplot(data=df, marker='.', color='k', ax=ax);

In [None]:
# Now z-score!
# per-column mean (skipna=True is already the default; spelled out for clarity)
col_mean = df.mean(skipna=True)

# per-column sample standard deviation (ddof=1 divides by N-1)
col_std = df.std(ddof=1)

# center each column on zero and scale it to unit standard deviation
df_z = (df - col_mean) / col_std

In [None]:
# same box/strip plot as for the raw data, now on the z-scored columns
ax = sns.boxplot(data=df_z, palette='Purples')
sns.stripplot(data=df_z, marker='.', color='k', ax=ax)

# summary statistics of the z-scored columns
df_z.describe()

# Min/max scaling

In [None]:
# in pandas

# per-column extremes define the rescaling
dmin, dmax = df.min(), df.max()

# shift every column so its minimum is 0, then divide by its range so its maximum is 1
df_mm = df.sub(dmin).div(dmax - dmin)

In [None]:
# box/strip plot of the min/max-scaled columns
ax = sns.boxplot(data=df_mm, palette='Purples')
sns.stripplot(data=df_mm, marker='.', color='k', ax=ax)

# summary statistics of the scaled columns
df_mm.describe()

In [None]:
# in scikit-learn
from sklearn.preprocessing import MinMaxScaler

# simulate 500 normally distributed samples as a single feature column
data = np.random.randn(500,1)

# create the scaler instance with a target range of -0.5 to 17.3
scalarfun = MinMaxScaler(feature_range=(-0.5, 17.3))

# learn the data range and rescale in a single step
data_s = scalarfun.fit_transform(data)

# show the minimum and maximum of the scaled data
data_s.min(), data_s.max()

In [None]:
# compare the original and rescaled samples side by side
datasets = [data, data_s]
h = sns.boxplot(data=datasets, palette='Purples')
sns.stripplot(data=datasets, marker='.', color='k')

# label the two boxes and render the figure
h.set_xticklabels(['Original', 'Scaled'])
plt.show()

# Binning

In [None]:
# numpy's histogram

# count the samples falling into each of two equal-width bins
bincounts, boundaries = np.histogram(data, bins=2)

# (optional) express the counts as a fraction of all samples
proportion = bincounts / bincounts.sum()

# report the bin edges, the counts, and the proportions (in that order)
for result in (boundaries, bincounts, proportion):
    print(result)

In [None]:
# but we need the individual data values...

# specify the bin boundaries
nbins = 5
data_lo, data_hi = np.min(data), np.max(data)

# Push the last edge slightly past the maximum so that np.digitize (whose bins
# are half-open: edge[i-1] <= x < edge[i]) puts the largest value in bin
# `nbins` instead of an extra overflow bin. The padding is a fraction of the
# data range: scaling the maximum itself (max*1.01) moves the edge the wrong
# way when the maximum is negative and not at all when it is zero.
pad = 0.01 * (data_hi - data_lo) or 1.0  # constant data: any positive pad works
boundaries = np.linspace(data_lo, data_hi + pad, nbins+1)

# bin the data: each value is replaced by the (1-based) index of its bin
binvalues = np.digitize(data, boundaries)

# and plot the bin index against the original value
plt.plot(binvalues, data, 'o')
plt.xticks(np.arange(1, nbins+1))
plt.xlim([0, nbins+1])
plt.xlabel('Binned values')
plt.ylabel('Original values')
plt.show()

# Unit normalization

In [None]:
# in numpy

# length (norm) of the data, and the data rescaled by that length
norm = np.linalg.norm(data)
data_u = data / norm

# report the norm before and after the rescaling
print(f'Original data norm: {norm:.3f}')
print(f'Unit-normed data norm: {np.linalg.norm(data_u):.3f}')


# scatter the normalized values against the originals; the title reports
# the correlation between the two
plt.plot(data, data_u, 'o')
plt.title(f'Correlation: {np.corrcoef(data.T,data_u.T)[0,1]}')
plt.xlabel('Original')
plt.ylabel('Unit-normed')
plt.show()

In [None]:
# in pandas

# uh oh...
# np.linalg.norm is called here without an `axis` argument, so NumPy returns a
# single scalar norm for the whole table rather than one norm per column.
# This cell is kept as a demonstration of that pitfall.
norms = np.linalg.norm(df)
norms

In [None]:
# do it manually

# Euclidean norm of each column: square, sum down the rows, take the root.
# The reduction axis is spelled out because np.sum(df) forwards axis=None to
# DataFrame.sum, a form that pandas 2.x deprecates (it warns and is slated to
# return one scalar for the whole table instead of one value per column).
norms = np.sqrt(np.square(df).sum(axis=0))

# print explicitly: a bare expression in the middle of a cell is not displayed
print(norms)

# divide every column by its own norm (the Series aligns on the column labels)
df_u = df.divide(norms)
df_u

# Rank transform

In [None]:
# in scipy
from scipy.stats import rankdata

# replace every value by its rank within the sample
data_rank = rankdata(data)

fig,axs = plt.subplots(1,2,figsize=(15,6))

# left panel: sorted raw values; right panel: sorted ranks
panels = [
    (np.sort(data, axis=0), 'Sorted original data'),
    (np.sort(data_rank), 'Sorted rank-transformed data'),
]

for ax, (values, title) in zip(axs, panels):
    ax.plot(values, 'o')
    ax.set_title(title)
    ax.set_xlabel('Data index')
    ax.set_ylabel('Data value')

plt.show()

In [None]:
# in pandas

# rank the values within each column; tied values share their average rank
# (both arguments are the pandas defaults, spelled out for clarity)
df.rank(axis=0, method='average')