# Data Normalization 

A subset of data transformation that focuses on transforming the skewed features of a dataset into normally distributed features.<br>
Typically used when the dataset has a skewed distribution and the algorithms assume normality of the features.<br>

In [38]:
# import necessary libraries 
from sklearn.preprocessing import normalize,Normalizer
import random 

i. `normalize` function: a quick and easy way to normalize a single array-like dataset using either the l1, l2, or max norm

In [39]:
# creating a random sample dataset
# Seed the generator first so this cell yields the same 10x10 sample on every
# run (including Restart & Run All); without a seed the dataset, and every
# output derived from it, changes each time the notebook is executed.
SEED = 42
random.seed(SEED)
data = [[random.randint(0, 100) for _ in range(10)] for _ in range(10)]
data

[[52, 63, 13, 46, 79, 92, 46, 19, 100, 0],
 [94, 47, 80, 33, 25, 17, 94, 77, 94, 43],
 [0, 50, 51, 61, 89, 87, 97, 72, 18, 47],
 [13, 65, 79, 18, 36, 77, 87, 95, 53, 52],
 [26, 23, 25, 34, 59, 60, 56, 73, 2, 83],
 [79, 5, 38, 70, 62, 50, 48, 54, 31, 13],
 [50, 10, 20, 86, 79, 55, 1, 18, 58, 37],
 [75, 74, 5, 26, 45, 59, 84, 70, 80, 17],
 [12, 25, 14, 7, 13, 80, 62, 68, 56, 8],
 [48, 58, 65, 33, 31, 66, 83, 99, 41, 30]]

In [40]:
# Rescale each row (sample) of `data` to unit l2 (Euclidean) length.
# axis=1 is the library default; it is spelled out to make the row-wise
# direction of the scaling explicit.
normalized_data = normalize(data, norm="l2", axis=1)
normalized_data.tolist()

[[0.27353271081333597,
  0.331395399639234,
  0.06838317770333399,
  0.24197124418102797,
  0.4155593106587219,
  0.48394248836205594,
  0.24197124418102797,
  0.09994464433564199,
  0.5260244438717999,
  0.0],
 [0.443623307853901,
  0.2218116539269505,
  0.37755175136502217,
  0.15574009743807163,
  0.11798492230156941,
  0.0802297471650672,
  0.443623307853901,
  0.3633935606888338,
  0.443623307853901,
  0.2029340663586994],
 [0.0,
  0.24562389350523078,
  0.2505363713753354,
  0.29966115007638155,
  0.4372105304393108,
  0.42738557469910154,
  0.47651035340014775,
  0.3536984066475323,
  0.08842460166188308,
  0.23088645989491693],
 [0.0647648447905696,
  0.32382422395284793,
  0.3935709798811537,
  0.08967440047925021,
  0.17934880095850042,
  0.3836071576056814,
  0.43342626898304265,
  0.4732815580849316,
  0.2640412903000145,
  0.2590593791622784],
 [0.16305828385315688,
  0.14424386648548493,
  0.15678681139726622,
  0.21323006350028206,
  0.3700168748975483,
  0.3762883473534

ii. Using Normalizer class

In [41]:
# Estimator-style equivalent of the normalize() call: configure the l2 norm
# once on the object, then apply it to `data`.
# transform() is called here without a preceding fit(); scikit-learn documents
# Normalizer as stateless.
normalizer = Normalizer(norm="l2")
normalized_data_2 = normalizer.transform(data)
normalized_data_2.tolist()

[[0.27353271081333597,
  0.331395399639234,
  0.06838317770333399,
  0.24197124418102797,
  0.4155593106587219,
  0.48394248836205594,
  0.24197124418102797,
  0.09994464433564199,
  0.5260244438717999,
  0.0],
 [0.443623307853901,
  0.2218116539269505,
  0.37755175136502217,
  0.15574009743807163,
  0.11798492230156941,
  0.0802297471650672,
  0.443623307853901,
  0.3633935606888338,
  0.443623307853901,
  0.2029340663586994],
 [0.0,
  0.24562389350523078,
  0.2505363713753354,
  0.29966115007638155,
  0.4372105304393108,
  0.42738557469910154,
  0.47651035340014775,
  0.3536984066475323,
  0.08842460166188308,
  0.23088645989491693],
 [0.0647648447905696,
  0.32382422395284793,
  0.3935709798811537,
  0.08967440047925021,
  0.17934880095850042,
  0.3836071576056814,
  0.43342626898304265,
  0.4732815580849316,
  0.2640412903000145,
  0.2590593791622784],
 [0.16305828385315688,
  0.14424386648548493,
  0.15678681139726622,
  0.21323006350028206,
  0.3700168748975483,
  0.3762883473534

<b>Interpretation:</b> We get the same normalized data as output from both the `normalize` function and the `Normalizer` class

Normalization using manual formula 

X = 2-D array of integer values<br>
X_normalized = (X - mean(X)) / std(X), where mean and std are computed separately for each column (feature)

In [None]:
def _column_stats(rows):
    """Return (means, stds) per column of a rectangular 2-D list of numbers.

    The standard deviation is the population form (divides by the number of
    rows, not by rows - 1).
    """
    columns = list(zip(*rows))
    means = [sum(col) / len(col) for col in columns]
    stds = [
        (sum((value - mean) ** 2 for value in col) / len(col)) ** 0.5
        for col, mean in zip(columns, means)
    ]
    return means, stds


def standardize_columns(rows):
    """Z-score every column of ``rows``: (value - column mean) / column std.

    Parameters
    ----------
    rows : list of equal-length sequences of numbers (one sequence per sample).

    Returns
    -------
    list of tuples, one per input row, in the original row order.

    A column with zero standard deviation (all values equal) maps to 0.0
    instead of raising ZeroDivisionError.
    """
    means, stds = _column_stats(rows)
    return [
        tuple(
            # `std` is 0.0 only for a constant column, whose deviations are all 0.
            (value - mean) / std if std else 0.0
            for value, mean, std in zip(row, means, stds)
        )
        for row in rows
    ]


X = [[random.randint(0, 100) for _ in range(10)] for _ in range(10)]
mean_X, std_X = _column_stats(X)
X_normalized = standardize_columns(X)
X_normalized

[(2.0482447665935957,
  0.0636903063861404,
  0.030385979497367207,
  -0.8253283208862453,
  1.7081146606386157,
  0.24838245264940184,
  -0.020511891512546312,
  -0.14078710723167925,
  0.39315972198521904,
  0.4162441203149241),
 (-1.1815224302306369,
  0.41109197758326943,
  1.2458251593920597,
  -0.4501790841197702,
  0.5544860227345397,
  1.5221386200822322,
  -0.08888486322103385,
  -1.273951628852512,
  -1.2498958325798757,
  -0.4808337251913777),
 (-0.5273923650510455,
  1.0769451807111001,
  0.5621406207012952,
  -0.8594327969559249,
  1.4476178714344696,
  -1.4075005650132777,
  -1.832395641787466,
  0.40862599416023965,
  -0.13496527769641853,
  -0.9831973186749067),
 (-1.3450549465255348,
  1.0190449021782453,
  -1.2230356747690345,
  1.3573581475732461,
  -0.7479979232861913,
  1.0126361531091002,
  1.0734556558232544,
  -0.5528469332756184,
  -1.3672569436202395,
  -1.090846660135663),
 (-0.28209359060869876,
  0.034740167119712985,
  -1.7547903159729625,
  -1.47331336621

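<b>Interpretation:</b> We got different output from the manual formula because it performs a different transformation: it standardizes each column (feature) to zero mean and unit standard deviation (z-score), whereas `normalize` and `Normalizer` rescale each row (sample) to unit l2 norm. It was also applied to a separately generated random dataset `X`, not to `data`.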