In [1]:
# (1) Import the required Python dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import DenseVector
from pyspark.mllib.linalg.distributed import RowMatrix

In [2]:
# (2) Instantiate (or reuse) a Spark Context
# The configuration targets the standalone Spark master used by this notebook.
conf = SparkConf().setMaster("spark://192.168.56.10:7077").setAppName("Principal Component Analysis - Movie Ratings")
# getOrCreate() keeps this cell safe to re-run: calling SparkContext(conf=conf) a second time
# in the same kernel raises "Cannot run multiple SparkContexts at once".
# NOTE: if a context is already active it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)
# SQLContext is the entry point used by the later cells to read the CSV into a DataFrame.
sqlContext = SQLContext(sc)

In [3]:
# (3) Read the pipe-delimited, pivoted User Movie Ratings CSV into a Spark DataFrame and print its (rows, columns) shape
user_movie_ratings_csv_path = '/data/workspaces/jillur.quddus/jupyter/notebooks/Machine-Learning-with-Apache-Spark-QuickStart-Guide/chapter05/data/movie-ratings-data/user-movie-ratings.csv'
csv_reader = (
    sqlContext.read.format('com.databricks.spark.csv')
    .option('header', 'true')        # first line holds the column names
    .option('inferschema', 'true')   # let Spark infer the column types
    .option('delimiter', '|')
)
user_movie_ratings_df = csv_reader.load(user_movie_ratings_csv_path)
row_count = user_movie_ratings_df.count()
column_count = len(user_movie_ratings_df.columns)
print((row_count, column_count))

(300, 3001)


In [4]:
# (4) Assemble every column except userId (i.e. the 3000 movie dimensions) into one MLlib feature vector per user
all_columns = user_movie_ratings_df.columns
# index() raises ValueError when 'userId' is missing, exactly as list.remove() would
user_id_position = all_columns.index('userId')
feature_columns = all_columns[:user_id_position] + all_columns[user_id_position + 1:]
vector_assembler = VectorAssembler(outputCol = 'features', inputCols = feature_columns)
assembled_df = vector_assembler.transform(user_movie_ratings_df)
# Keep only the user identifier and its assembled feature vector
user_movie_ratings_features_df = assembled_df.select(['userId', 'features'])
user_movie_ratings_features_df.show()

+------+--------------------+
|userId|            features|
+------+--------------------+
|     1|(3000,[0,2,5,43,4...|
|     2|(3000,[277,291,12...|
|     3|(3000,[30,461,545...|
|     4|(3000,[20,31,41,4...|
|     5|(3000,[0,20,32,33...|
|     6|(3000,[1,2,3,4,5,...|
|     7|(3000,[0,46,52,12...|
|     8|(3000,[1,9,10,20,...|
|     9|(3000,[37,158,190...|
|    10|(3000,[257,314,50...|
|    11|(3000,[5,9,33,40,...|
|    12|(3000,[35,140,189...|
|    13|(3000,[43,265,514...|
|    14|(3000,[3,6,18,24,...|
|    15|(3000,[0,40,43,13...|
|    16|(3000,[43,46,98,1...|
|    17|(3000,[0,40,43,46...|
|    18|(3000,[0,1,5,15,3...|
|    19|(3000,[0,1,2,6,9,...|
|    20|(3000,[1,7,12,32,...|
+------+--------------------+
only showing top 20 rows



In [5]:
# (5) Standardise the feature vectors: centre each feature on its mean and scale it to unit standard deviation
standardizer_params = {
    'inputCol': 'features',
    'outputCol': 'std_features',
    'withMean': True,
    'withStd': True,
}
standardizer = StandardScaler(**standardizer_params)
# Fit on the assembled features, then apply the fitted model to the same DataFrame
standardizer_model = standardizer.fit(user_movie_ratings_features_df)
user_movie_ratings_standardized_features_df = standardizer_model.transform(user_movie_ratings_features_df)
user_movie_ratings_standardized_features_df.show()

+------+--------------------+--------------------+
|userId|            features|        std_features|
+------+--------------------+--------------------+
|     1|(3000,[0,2,5,43,4...|[1.30727676187958...|
|     2|(3000,[277,291,12...|[-0.7406763898900...|
|     3|(3000,[30,461,545...|[-0.7406763898900...|
|     4|(3000,[20,31,41,4...|[-0.7406763898900...|
|     5|(3000,[0,20,32,33...|[1.30727676187958...|
|     6|(3000,[1,2,3,4,5,...|[-0.7406763898900...|
|     7|(3000,[0,46,52,12...|[1.56327090585078...|
|     8|(3000,[1,9,10,20,...|[-0.7406763898900...|
|     9|(3000,[37,158,190...|[-0.7406763898900...|
|    10|(3000,[257,314,50...|[-0.7406763898900...|
|    11|(3000,[5,9,33,40,...|[-0.7406763898900...|
|    12|(3000,[35,140,189...|[-0.7406763898900...|
|    13|(3000,[43,265,514...|[-0.7406763898900...|
|    14|(3000,[3,6,18,24,...|[-0.7406763898900...|
|    15|(3000,[0,40,43,13...|[0.53929432996599...|
|    16|(3000,[43,46,98,1...|[-0.7406763898900...|
|    17|(3000,[0,40,43,46...|[1

In [9]:
# (6) Build a RowMatrix (an index-less distributed matrix whose rows are vectors) from the standardised features
def _std_features_as_list(row):
    """Return the single selected column of `row` (the std_features vector) as a plain Python list."""
    return row[0].tolist()

scaled_features_rows_rdd = user_movie_ratings_standardized_features_df.select("std_features").rdd
scaled_feature_lists_rdd = scaled_features_rows_rdd.map(_std_features_as_list)
scaled_features_matrix = RowMatrix(scaled_feature_lists_rdd)

print("Scaled Features Matrix Dimensions: \n")
scaled_features_matrix_dimensions = (scaled_features_matrix.numRows(), scaled_features_matrix.numCols())
print(scaled_features_matrix_dimensions)

print("\nScaled Features Matrix (1st Row/Vector with 3000 elements): \n")
# Pull the rows back to the driver so the first one can be printed
scaled_features_matrix_collected = scaled_features_matrix.rows.collect()
first_scaled_row = scaled_features_matrix_collected[0]
print(first_scaled_row)

Scaled Features Matrix Dimensions: 

(300, 3000)

Scaled Features Matrix (1st Row/Vector with 3000 elements): 

[1.3072767618795875,-0.41490057621793586,3.5375995315133344,-0.12413332029054493,-0.29232139157045567,2.3149832826753065,-0.321364665195093,-0.10484582518547665,-0.12713049265287288,-0.48201250142180474,-0.34405799994956315,-0.1652870138707658,-0.1129182582625171,-0.1417654493456247,-0.12578906440049703,-0.36229199172702486,-0.32924500259439565,-0.18030655802588758,-0.3645955110290761,-0.14563526140600572,-0.4035434227397829,-0.20932104168048926,-0.16866421037684115,-0.1938318591111947,-0.37393037757146147,-0.1531531225270802,-0.08018795125754069,-0.12700283554257744,-0.2561028259896872,0.0,-0.24359263055411967,-0.6104141365818677,-0.4718136368943106,-0.3601941429542023,-0.08128298495691468,-0.4340750718344838,0.0,-0.1528228587163035,-0.057735026918962574,-0.09678076170967075,-0.27192717117793036,-0.21513569568174448,-0.14162158571989528,1.8851405059502864,-0.3492354698576519

In [7]:
# (7) Compute the leading 300 principal components of the scaled features matrix
# (eigenvectors ordered by their corresponding eigenvalues)
number_principal_components = 300
principal_components = scaled_features_matrix.computePrincipalComponents(
    number_principal_components
)
# A single print with a newline separator emits the same two lines as two consecutive prints
print(
    "Top %d Principal Components: \n" % number_principal_components,
    principal_components,
    sep="\n",
)

Top 300 Principal Components: 

DenseMatrix([[-2.26025023e-02, -5.83666342e-03,  3.54067168e-03, ...,
              -3.08088044e-01, -5.20895547e-01,  1.42120913e-02],
             [-2.69816586e-02,  1.01911199e-02,  4.52645020e-03, ...,
              -2.68507903e-02, -2.02412549e-04,  2.43784758e-01],
             [-2.21792500e-02,  1.71186397e-02, -3.49573662e-03, ...,
               2.13933995e-02, -1.06985702e-05, -9.07297089e-02],
             ...,
             [-1.36515538e-02, -3.91842123e-02,  2.85213135e-02, ...,
              -3.09378585e-04, -1.02077870e-02,  5.59642329e-04],
             [-1.75252096e-02, -1.35609961e-02,  2.26058887e-02, ...,
               3.60095944e-02, -3.91418235e-04, -5.37917465e-03],
             [-1.81504641e-02, -2.04315059e-02, -4.44993070e-04, ...,
              -2.62856441e-03,  5.28657616e-03,  1.82515683e-04]])


In [10]:
# (8) Reduce the User Movie Ratings dataset from 3000 dimensions to 300 dimensions
# by multiplying the scaled features matrix by the matrix of principal components
projected_matrix = scaled_features_matrix.multiply(principal_components)

print("Projected Matrix Dimensions: \n")
projected_matrix_dimensions = (projected_matrix.numRows(), projected_matrix.numCols())
print(projected_matrix_dimensions)

print("\nProjected Matrix (1st Row/Vector with 300 elements): \n")
# Pull the projected rows back to the driver so the first one can be printed
projected_matrix_collected = projected_matrix.rows.collect()
first_projected_row = projected_matrix_collected[0]
print(first_projected_row)

Projected Matrix Dimensions: 

(300, 300)

Projected Matrix (1st Row/Vector with 300 elements): 

[-20.502263804758883,-2.601630402551618,1.765855389379487,-1.886908727572231,5.77802899184358,5.8527310404206565,7.2128326719293225,-9.598732006829982,1.7578675287107695,3.0907105744549774,2.1852291322943644,-9.780726046386926,8.036736357297778,7.968420643845778,18.037345053909675,1.3425307514349782,-4.257116490623351,-13.44991019605675,7.256533507504017,-1.3645891687223979,1.7954071695678067,47.915705218924636,38.772709154573384,-4.522498852365555,-29.68348302863417,26.253221884980825,-28.823759164165125,18.388956693727202,-2.021980092701811,0.7540819460732486,1.75980203582433,-7.181154839369973,0.7161425093821583,6.274512232738007,-3.973405406054397,-2.1086830204991975,-5.313205672454199,1.3203638144601109,-1.708377144280659,-4.041665301481718,1.251843405549477,3.1538905741665877,5.279440891632737,-2.5771082035633452,4.038145381515312,1.9228788405300152,-0.8143819523661725,0.181653933246

In [11]:
# (9) Alternative route: fit MLlib's PCA estimator directly on the standardised DataFrame
pca_params = {
    'k': number_principal_components,
    'inputCol': "std_features",
    'outputCol': "pca_features",
}
pca = PCA(**pca_params)
# Fit on the standardised features, then append the projected vectors as the pca_features column
pca_model = pca.fit(user_movie_ratings_standardized_features_df)
user_movie_ratings_pca_df = pca_model.transform(user_movie_ratings_standardized_features_df)
user_movie_ratings_pca_df.show()

+------+--------------------+--------------------+--------------------+
|userId|            features|        std_features|        pca_features|
+------+--------------------+--------------------+--------------------+
|     1|(3000,[0,2,5,43,4...|[1.30727676187958...|[-20.502263804759...|
|     2|(3000,[277,291,12...|[-0.7406763898900...|[7.56627528338625...|
|     3|(3000,[30,461,545...|[-0.7406763898900...|[6.42605103583382...|
|     4|(3000,[20,31,41,4...|[-0.7406763898900...|[-5.2076851522311...|
|     5|(3000,[0,20,32,33...|[1.30727676187958...|[6.02182695509906...|
|     6|(3000,[1,2,3,4,5,...|[-0.7406763898900...|[-15.089540317929...|
|     7|(3000,[0,46,52,12...|[1.56327090585078...|[3.80979953979567...|
|     8|(3000,[1,9,10,20,...|[-0.7406763898900...|[5.86630895281757...|
|     9|(3000,[37,158,190...|[-0.7406763898900...|[6.39178510966064...|
|    10|(3000,[257,314,50...|[-0.7406763898900...|[7.03876786190874...|
|    11|(3000,[5,9,33,40,...|[-0.7406763898900...|[2.95625036980

In [12]:
# (10) Read the Explained Variance from the fitted PCA model: a vector holding, for each
# Principal Component, the proportion of variance it explains
explained_variance = pca_model.explainedVariance
# Leave the vector as the cell's last expression so the notebook displays it
explained_variance

DenseVector([0.0819, 0.0401, 0.0386, 0.0333, 0.0315, 0.0296, 0.0278, 0.0229, 0.0212, 0.0192, 0.0188, 0.0181, 0.0177, 0.0166, 0.0166, 0.0156, 0.0151, 0.0143, 0.0133, 0.0121, 0.0115, 0.0111, 0.0106, 0.0097, 0.0095, 0.0094, 0.0093, 0.0091, 0.0086, 0.0086, 0.0081, 0.0078, 0.0076, 0.0076, 0.0073, 0.0072, 0.0068, 0.0066, 0.0065, 0.0064, 0.0062, 0.006, 0.0059, 0.0056, 0.0055, 0.0053, 0.0052, 0.0052, 0.0051, 0.005, 0.0048, 0.0047, 0.0046, 0.0044, 0.0043, 0.0042, 0.0041, 0.0041, 0.0039, 0.0039, 0.0038, 0.0038, 0.0037, 0.0036, 0.0036, 0.0035, 0.0035, 0.0034, 0.0034, 0.0033, 0.0032, 0.0031, 0.0031, 0.003, 0.003, 0.003, 0.0029, 0.0028, 0.0028, 0.0028, 0.0027, 0.0027, 0.0026, 0.0025, 0.0025, 0.0025, 0.0024, 0.0023, 0.0023, 0.0023, 0.0022, 0.0022, 0.0022, 0.0022, 0.0021, 0.0021, 0.0021, 0.0021, 0.002, 0.002, 0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.0017, 0.0017, 0.0017, 0.0017, 0.0016, 0.0016, 0.0016, 0.0015, 0.0015, 0.0015, 0.0015, 0.0014, 0.0014, 0.0014, 0.0013, 0.0013, 0.

In [13]:
# (11) Stop the Spark Context (`sc`) that was instantiated in step (2)
sc.stop()