In [30]:
from pyspark.sql import SparkSession

# Location of the iris CSV. NOTE: this is an absolute, machine-specific path;
# point it at your own copy of the file before running.
filepath = '/Users/williamtun/Documents/Code/DataEngineer/pyspark/iris.csv'

# Get (or start) the SparkSession used by every cell below.
# NOTE(review): "spark.some.config.option" looks like a placeholder key —
# confirm whether it can be dropped.
spark = (
    SparkSession.builder
    .appName("Python Spark create RDD example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Read with Spark's built-in CSV reader rather than the legacy
# 'com.databricks.spark.csv' format name. The header option is now given once
# (it used to be passed both through options() and through load()).
df = spark.read.csv(filepath, header=True, inferSchema=True)

# Preview the first five rows.
df.show(5)

+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [31]:
from pyspark.sql import functions as F
from pyspark.sql.functions import rank,sum,col
from pyspark.sql import Window

# Frame with no partitioning that is unbounded on both ends, so an aggregate
# evaluated over it sees every row of the DataFrame.
window = Window.rowsBetween(
    Window.unboundedPreceding,
    Window.unboundedFollowing,
)

In [32]:
# Replace the dotted measurement column names with underscore versions.
renamed_columns = {
    "sepal.length": "sepal_length",
    "sepal.width": "sepal_width",
    "petal.length": "petal_length",
    "petal.width": "petal_width",
}

df2 = df
for old_name, new_name in renamed_columns.items():
    df2 = df2.withColumnRenamed(old_name, new_name)

# Summary statistics of sepal length per variety

In [None]:
# Sum sepal_num over all varieties to get the overall total;
# this total is the denominator when computing each variety's percentage.

In [45]:
# Per-variety summary of sepal_length: count, mean, min and max, plus each
# variety's share of the overall count as a percentage.
# All Spark functions are referenced through the `F` alias so the chain does
# not depend on a bare `sum` that shadows Python's builtin.
tab = (
    df2.select('variety', 'sepal_length')
    .groupBy('variety')
    .agg(
        F.count('sepal_length').alias('sepal_num'),
        F.mean('sepal_length').alias('sepal_avg'),
        F.min('sepal_length').alias('sepal_min'),
        F.max('sepal_length').alias('sepal_max'),
    )
    # Total of sepal_num across all groups (unbounded window); a temporary
    # column used only as the denominator on the next line.
    .withColumn('total', F.sum(F.col('sepal_num')).over(window))
    .withColumn('Percent', F.col('sepal_num') * 100 / F.col('total'))
    .drop('total')
)


In [46]:
# Display the per-variety summary table.
tab.show()

+----------+---------+-----------------+---------+---------+------------------+
|   variety|sepal_num|        sepal_avg|sepal_min|sepal_max|           Percent|
+----------+---------+-----------------+---------+---------+------------------+
| Virginica|       50|6.587999999999998|      4.9|      7.9|33.333333333333336|
|    Setosa|       50|5.005999999999999|      4.3|      5.8|33.333333333333336|
|Versicolor|       50|            5.936|      4.9|      7.0|33.333333333333336|
+----------+---------+-----------------+---------+---------+------------------+



# Correlation matrix

In [48]:
from pyspark.mllib.stat import Statistics
import pandas as pd

In [47]:
# Keep only the three measurement columns that go into the correlation matrix.
corr_data = df2.select('sepal_length', 'sepal_width', 'petal_length')

In [50]:
# Column names of the selected features; reused later as the row and column
# labels of the correlation table.
col_names = corr_data.columns
col_names

['sepal_length', 'sepal_width', 'petal_length']

In [51]:
# Turn each Row into a plain tuple of its values, giving an RDD of tuples.
features = corr_data.rdd.map(tuple)

In [59]:
# take(3) fetches only the first three records instead of collecting the
# entire RDD to the driver and slicing it there.
features.take(3) # show first 3 rows of dataset

[(5.1, 3.5, 1.4), (4.9, 3.0, 1.4), (4.7, 3.2, 1.3)]

In [60]:
# Pearson correlation matrix computed across the columns of the `features` RDD.
corr_mat=Statistics.corr(features, method="pearson")

In [61]:
# Wrap the correlation matrix in a pandas DataFrame labelled with the feature
# names on both axes, then print it as plain text.
corr_df = pd.DataFrame(corr_mat, index=col_names, columns=col_names)
print(corr_df.to_string())

              sepal_length  sepal_width  petal_length
sepal_length      1.000000     -0.11757      0.871754
sepal_width      -0.117570      1.00000     -0.428440
petal_length      0.871754     -0.42844      1.000000


# Cross table - count of rows for each sepal_length / variety combination

In [64]:
# Contingency table: one row per distinct sepal_length, one column per variety,
# each cell holding the number of matching rows.
df2.stat.crosstab("sepal_length", "variety").show()

+--------------------+------+----------+---------+
|sepal_length_variety|Setosa|Versicolor|Virginica|
+--------------------+------+----------+---------+
|                 5.0|     8|         2|        0|
|                 4.7|     2|         0|        0|
|                 7.6|     0|         0|        1|
|                 7.4|     0|         0|        1|
|                 6.1|     0|         4|        2|
|                 7.9|     0|         0|        1|
|                 4.3|     1|         0|        0|
|                 7.2|     0|         0|        3|
|                 6.0|     0|         4|        2|
|                 4.4|     3|         0|        0|
|                 4.6|     4|         0|        0|
|                 6.4|     0|         2|        5|
|                 6.8|     0|         1|        2|
|                 5.7|     2|         5|        1|
|                 5.1|     8|         1|        0|
|                 6.9|     0|         1|        3|
|                 5.3|     1|  