In [8]:
from pyspark.sql import Window
from pyspark.sql.functions import mean

In [4]:
# Read the diamonds CSV from S3, taking column names from the first line and
# letting Spark infer the column types.
# NOTE(review): `spark` is not defined in the visible cells; presumably the
# session is supplied by the notebook kernel - confirm.
df = spark.read.csv(
    's3://ui-spark-data/diamonds.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

In [5]:
# Print the leading rows as a text table to check the parsed header and values.
df.show()

+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|
| 0.23|Very Good|    H|    VS1| 59.4| 61.0|  338| 4.0|4.05|2.39|
|  0.3|     Good|    J|    SI1| 64.0| 55.0|  339|4.25|4.28|2.73|
| 0.23|    Ideal|    J|    VS1| 62.8| 56.0|  340|3.93| 3.9|2.46|
| 0.22|  Premium|    F|  

In [6]:
# Per-cut window ordered by price; the frame runs from the 3 rows before the
# current row through the 3 rows after it (at most 7 rows).
# NOTE(review): price is not unique within a cut (345 appears twice for
# Premium in the output below), so the order among tied rows - and therefore
# the frame contents - is presumably not guaranteed stable between runs.
window = (
    Window
    .partitionBy('cut')
    .orderBy('price')
    .rowsBetween(-3, 3)
)

In [7]:
# Echo the window spec; the repr only identifies the WindowSpec object.
window

<pyspark.sql.window.WindowSpec at 0x7f0ed4088510>

In [9]:
# Average of price evaluated over the sliding per-cut window. The result is a
# column expression that is applied when it is attached to a DataFrame.
moving_avg = (
    mean(df['price'])
    .over(window)
)

In [10]:
# Echo the column expression to confirm the partition, ordering and row frame.
moving_avg

Column<avg(price) OVER (PARTITION BY cut ORDER BY price ASC ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING)>

In [15]:
# Attach the windowed average as 'moving_average' and print the leading rows.
(
    df
    .withColumn('moving_average', moving_avg)
    .show()
)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+------------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|    moving_average|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+------------------+
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|            336.75|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|             338.4|
| 0.22|Premium|    F|    SI1| 60.4| 61.0|  342|3.88|3.84|2.33| 341.1666666666667|
|  0.2|Premium|    E|    SI2| 60.2| 62.0|  345|3.79|3.75|2.27| 344.2857142857143|
| 0.32|Premium|    E|     I1| 60.9| 58.0|  345|4.38|4.42|2.68|349.85714285714283|
| 0.24|Premium|    I|    VS1| 62.5| 57.0|  355|3.97|3.94|2.47|354.57142857142856|
| 0.31|Premium|    J|    SI1| 60.9| 60.0|  363|4.36|4.38|2.66|358.14285714285717|
| 0.32|Premium|    J|    SI1| 62.2| 59.0|  365|4.37|4.41|2.73| 361.2857142857143|
|  0.2|Premium|    E|    VS2| 59.8| 62.0|  367|3.79|3.77|2.26|364.42857142857144|
|  0.2|Premium| 

In [16]:
# Check the Python type of the windowed aggregate (a pyspark Column).
type(moving_avg)

pyspark.sql.column.Column

In [17]:
from pyspark.sql.functions import udf

def switch_col(a, b, target=61.1):
    """Return ``b`` when ``a`` equals ``target``; otherwise return ``a``.

    Args:
        a: Value to test; returned unchanged when it does not match.
        b: Replacement returned when ``a`` matches ``target``.
        target: Sentinel compared against ``a`` with ``==``. Defaults to
            61.1, the value this function previously hard-coded, so
            two-argument callers behave exactly as before.

    Returns:
        Either ``b`` or ``a``, passed through without conversion, so the
        result type follows whichever argument was selected.
    """
    # Exact equality on purpose: a value that merely rounds to the target, or
    # a missing (None) ``a`` with a numeric target, falls through to ``a``.
    return b if a == target else a
    
# Wrap switch_col as a Spark UDF so it can be applied to DataFrame columns.
# NOTE(review): no returnType is passed, so PySpark's documented default
# (StringType) applies and the resulting column is presumably string-typed
# rather than numeric - confirm with printSchema() before doing arithmetic.
udf_switch = udf(switch_col)

In [18]:
# Add 'repl_sixoneone', computed by the UDF from each row's depth and price.
df_new = df.withColumn(
    'repl_sixoneone',
    udf_switch('depth', 'price'),
)

In [20]:
# Rows whose depth equals 61.1; 'repl_sixoneone' is expected to hold the price.
# The filter column is taken from df_new itself rather than its ancestor df,
# so the predicate does not rely on cross-DataFrame column resolution.
df_new.where(df_new['depth'] == 61.1).show()

+-----+---------+-----+-------+-----+-----+-----+----+----+----+--------------+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|repl_sixoneone|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+--------------+
| 0.33|    Ideal|    J|    SI1| 61.1| 56.0|  403|4.49|4.55|2.76|           403|
|  0.7|Very Good|    D|    SI1| 61.1| 58.0| 2768|5.66|5.73|3.48|          2768|
|  0.7|  Premium|    E|    VS2| 61.1| 60.0| 2777|5.71|5.64|3.47|          2777|
| 0.83|Very Good|    I|    VS1| 61.1| 60.0| 2788|6.07| 6.1|3.72|          2788|
| 0.74|     Fair|    F|    VS2| 61.1| 68.0| 2805|5.82|5.75|3.53|          2805|
| 0.75|  Premium|    E|    SI1| 61.1| 59.0| 2814|5.86|5.83|3.57|          2814|
| 0.76|  Premium|    D|    SI1| 61.1| 59.0| 2847|5.93|5.88|3.61|          2847|
|  0.7|    Ideal|    H|    VS2| 61.1| 57.0| 2862|5.71|5.74| 3.5|          2862|
| 0.72|    Ideal|    D|    SI1| 61.1| 56.0| 2891|5.78|5.81|3.54|          2891|
| 0.76|  Premium|    E|    SI1| 61.1| 58

In [21]:
# Rows whose depth differs from 61.1; 'repl_sixoneone' is expected to repeat
# the depth. The filter column is taken from df_new itself rather than its
# ancestor df, so the predicate does not rely on cross-DataFrame resolution.
df_new.where(df_new['depth'] != 61.1).show()

+-----+---------+-----+-------+-----+-----+-----+----+----+----+--------------+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|repl_sixoneone|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+--------------+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|          61.5|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|          59.8|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|          56.9|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|          62.4|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|          63.3|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|          62.8|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|          62.3|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|          61.9|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|          65.1|
| 0.23|Very Good|    H|    VS1| 59.4| 61