In [1]:
# Locate the local Spark installation first; findspark.init() makes the
# pyspark package importable, so it has to run before the pyspark imports.
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Reuse an already-running context/session if one exists, otherwise start one.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

# Bare last expression: displays the session summary in the cell output.
spark

# Normalizing Data

In [2]:
from pyspark.ml.feature import MinMaxScaler

In [3]:
from pyspark.ml.linalg import Vectors

In [5]:
# Toy dataset: three rows, each carrying a 3-element dense feature vector whose
# components live on very different scales (tens, tens of thousands, units).
# NOTE(review): id 1 is used for both the first and third row -- possibly meant
# to be 3. Left unchanged so the recorded outputs below stay accurate.
features_df = spark.createDataFrame(
    [
        (1, Vectors.dense([10.0, 10000.0, 1.0])),
        (2, Vectors.dense([20.0, 30000.0, 2.0])),
        (1, Vectors.dense([30.0, 40000.0, 3.0])),
    ],
    schema=["id", "features"],
)

# Preview only the first two rows.
features_df.show(n=2)

+---+------------------+
| id|          features|
+---+------------------+
|  1|[10.0,10000.0,1.0]|
|  2|[20.0,30000.0,2.0]|
+---+------------------+
only showing top 2 rows



In [6]:
# Min-max scaler: reads the "features" vector column and writes the rescaled
# vector to a new "sfeatures" column (default output range, no min/max given).
feature_scaler = MinMaxScaler(
    inputCol="features",
    outputCol="sfeatures",
)

In [7]:
# Fit the min-max scaler on features_df; the returned model holds the
# statistics learned from the "features" column.
smodel = feature_scaler.fit(features_df)

In [8]:
# Apply the fitted model: yields a new DataFrame with the original columns plus
# the scaled "sfeatures" column (features_df itself is not modified).
sfeatures_df = smodel.transform(features_df)

In [11]:
# Show every row without truncating the vector columns.
# In the output, (3,[],[]) is Spark's sparse-vector notation for a length-3
# vector with no non-zero entries, i.e. [0.0, 0.0, 0.0] (the per-feature minimum).
sfeatures_df.show(truncate=False)

+---+------------------+----------------------------+
|id |features          |sfeatures                   |
+---+------------------+----------------------------+
|1  |[10.0,10000.0,1.0]|(3,[],[])                   |
|2  |[20.0,30000.0,2.0]|[0.5,0.6666666666666667,0.5]|
|1  |[30.0,40000.0,3.0]|[1.0,1.0,1.0]               |
+---+------------------+----------------------------+



# Standardizing Data

In [12]:
from pyspark.ml.feature import StandardScaler

In [19]:
# Standard scaler: centre each feature (withMean) and scale it by its standard
# deviation (withStd).
# NOTE: the output column reuses the name "sfeatures" from the min-max section;
# there is no clash because this scaler is applied to features_df, which does
# not contain that column.
feature_std_scaler = StandardScaler(
    inputCol="features",
    outputCol="sfeatures",
    withStd=True,
    withMean=True,
)

In [20]:
# Fit the standard scaler on the original (unscaled) features_df.
stdmodel = feature_std_scaler.fit(features_df)

In [21]:
# Apply the fitted model: original columns plus the standardized "sfeatures"
# column, returned as a new DataFrame.
stdfeatures_df = stdmodel.transform(features_df)

In [22]:
# Show every row without truncating the vector columns.
# The first feature (10, 20, 30) maps to exactly -1, 0, 1, which is consistent
# with scaling by the sample (n-1) standard deviation, here 10.
stdfeatures_df.show(truncate=False)

+---+------------------+------------------------------+
|id |features          |sfeatures                     |
+---+------------------+------------------------------+
|1  |[10.0,10000.0,1.0]|[-1.0,-1.091089451179962,-1.0]|
|2  |[20.0,30000.0,2.0]|[0.0,0.2182178902359923,0.0]  |
|1  |[30.0,40000.0,3.0]|[1.0,0.8728715609439696,1.0]  |
+---+------------------+------------------------------+

