In [2]:
# Notebook bootstrap cell: copy it to the top of every new notebook and
# change APP_NAME so the Spark application is identifiable.
import findspark

SPARK_HOME = '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'
APP_NAME = 'transform'

# Point findspark at the local Spark install; this call is kept ahead of the
# pyspark imports, exactly as in the original cell order.
findspark.init(SPARK_HOME)

import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one.
session_builder = SparkSession.builder.appName(APP_NAME)
spark = session_builder.getOrCreate()


In [3]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [4]:
# Read the happiness dataset from CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
csv_path = "./finalEdited-HumanHappiness-BDAS-mining.csv"
csv_reader = spark.read.format("csv").options(header=True, inferSchema=True)
df = csv_reader.load(csv_path)

# VectorAssembler

In [5]:
# Columns of `df` that are packed into a single vector column.
# The names are reproduced exactly from the original list (including the
# double space in 'Government  enterprises and investment'); do not "tidy" them.
feature_columns = [
    'Year',
    'Rule of Law',
    'Disappearances, Conflicts, and Terrorism',
    'Women Security & Safety',
    'Security & Safety',
    'Women Movement',
    'Movement',
    'Legal and Regulatory Restrictions',
    'State Control over Internet Access',
    'Expression & Information',
    'Same Sex Relationships',
    'Divorce',
    'PERSONAL FREEDOM (Score)',
    'PERSONAL FREEDOM (Rank)',
    'Government  enterprises and investment',
    'Top marginal income tax rate',
    'Legal enforcement of contracts',
    'Reliability of police',
    'Gender Legal Rights Adjustment',
    'Money growth',
    'Inflation: Most recent year',
    'Compliance costs of importing and exporting',
    'Regulatory trade barriers',
    'Foreign ownership/investment restrictions',
    'Freedom to trade internationally',
    'Hiring regulations and minimum wage',
    'Labour market regulations',
    'Licensing restrictions',
    'Business regulations',
    'ECONOMIC FREEDOM (Score)',
    'ECONOMIC FREEDOM (Rank)',
]

# inputCols names the source columns to combine; outputCol names the new
# vector column that the assembler will produce.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [6]:
# Run the assembler over `df`. The result, `output`, is a new DataFrame that
# carries the assembled vector in the "features" column (the outputCol
# configured on the assembler); `df` itself is not modified.
output = assembler.transform(df)

In [9]:
# Print the first 2 rows of the assembled DataFrame; truncate=False shows
# every cell in full, so the table is very wide.
output.show(2, truncate=False)

+----+---------+--------------------------+---------------------+--------------------+------------------------+-----------+----------------------------------------+-----------------------+-----------------+--------------+-----------+---------------------------------+----------------------------------+------------------------+----------------------+-------+------------------------+-----------------------+--------------------------------------+----------------------------+------------------------------+---------------------+------------------------------+------------+---------------------------+-------------------------------------------+-------------------------+-----------------------------------------+--------------------------------+-----------------------------------+-------------------------+----------------------+--------------------+------------------------+-----------------------+----------------------------------------------------------------------------------------------------

# StandardScaler

In [15]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# StandardScaler with its default settings, reading the assembled "features"
# vector and writing the result to a new "scaled" column.
# NOTE(review): per the PySpark docs the defaults are withMean=False /
# withStd=True (scale to unit standard deviation, no mean-centering) —
# confirm that is the intended standardization.
standard_scaler = StandardScaler().setInputCol("features").setOutputCol("scaled")

In [16]:
# Step 1: fit the scaler on `output` to obtain a fitted model.
standard_scaler_model = standard_scaler.fit(output)
# Step 2: apply the fitted model to the same data, adding the "scaled" column.
train = standard_scaler_model.transform(output)

In [17]:
# Inspect the StandardScaler result: first 5 "scaled" vectors, printed in
# full (truncate=False).
train.select("scaled").show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaled                                                                                                                                                                                                                                                                                                                                                                                                                                    

# MinMaxScaler

In [19]:
from pyspark.ml.feature import MinMaxScaler

In [20]:
# Peek at the unscaled input: first 2 assembled "features" vectors, using
# show()'s default truncation of long cell values.
output.select("features").show(2)

+--------------------+
|            features|
+--------------------+
|[2017.0,5.2917516...|
|[2017.0,3.7960803...|
+--------------------+
only showing top 2 rows



In [21]:
# MinMaxScaler with its default settings, reading the assembled "features"
# vector and writing the rescaled vector to a "scaled" column.
# NOTE(review): per the PySpark docs the default output range is [0.0, 1.0].
minmax_scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")

In [22]:
# Step 1: fit the min-max scaler on `output` to obtain a fitted model.
minmax_scaler_model = minmax_scaler.fit(output)
# Step 2: apply it to the same data. This rebinds `train`, replacing the
# StandardScaler result that was stored under the same name earlier.
train = minmax_scaler_model.transform(output)

In [23]:
# Inspect the MinMaxScaler result: first 5 "scaled" vectors, printed in
# full (truncate=False).
train.select("scaled").show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaled                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+-----------------------------------------------------------------------