In [2]:
from pyspark.sql import SparkSession
# Obtain a Spark session against the "local[*]" master; getOrCreate() hands
# back an existing session when there is one instead of building a second.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# Smoke test: counting a 4-row range should display 4 if the session works.
spark.range(4).count()

4

In [36]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType
from pyspark.sql.functions import to_json, col, get_json_object, struct, current_date
from pyspark.sql.functions import array, lit, explode, monotonically_increasing_id
from random_words import RandomWords
from tabulate import tabulate

# Two nullable string fields. NOTE(review): nothing in this cell reads
# `schema`; it is kept because other cells may depend on it -- confirm before
# deleting.
schema = StructType([
    StructField("name1", StringType(), True),
    StructField("name2", StringType(), True),
])

# Build the word generator once and reuse it for all ten draws, rather than
# constructing a fresh RandomWords object for every single word.
word_generator = RandomWords()
random_words_expr = array([lit(word_generator.random_word())
                           for _ in range(10)])

# explode() below emits its values in a column named "col", which is what
# col("col") refers to. The inner struct has no alias, so it shows up under
# the auto-generated key "col1" in the JSON output.
inner_struct = struct(col("col"), col("id"))
outer_struct = struct(inner_struct, current_date().alias("today"))

# One seed row -> ten rows (one per random word) -> attach a generated id ->
# serialize the nested struct into a single JSON string column.
df = (
    spark.range(1)
    .select(explode(random_words_expr))
    .withColumn("id", monotonically_increasing_id())
    .select(to_json(outer_struct).alias("json"))
)
print(tabulate(df.collect(), df.columns))
print("")
df.printSchema()

json
--------------------------------------------------------------------
{"col1":{"col":"tents","id":25769803776},"today":"2017-09-19"}
{"col1":{"col":"proofs","id":25769803777},"today":"2017-09-19"}
{"col1":{"col":"deletion","id":25769803778},"today":"2017-09-19"}
{"col1":{"col":"tuesdays","id":25769803779},"today":"2017-09-19"}
{"col1":{"col":"settings","id":25769803780},"today":"2017-09-19"}
{"col1":{"col":"headsets","id":25769803781},"today":"2017-09-19"}
{"col1":{"col":"cleanliness","id":25769803782},"today":"2017-09-19"}
{"col1":{"col":"concern","id":25769803783},"today":"2017-09-19"}
{"col1":{"col":"picks","id":25769803784},"today":"2017-09-19"}
{"col1":{"col":"warship","id":25769803785},"today":"2017-09-19"}

root
 |-- json: string (nullable = true)



In [37]:
from pyspark.sql.functions import udf

# Cube ids 0..9 with an anonymous Python UDF. No returnType is passed to
# udf(), so its default applies (StringType per the PySpark docs -- confirm
# for the Spark version in use). The lambda is intentionally left unnamed:
# the result column is labelled "<lambda>(id)".
(
    spark.range(10)
    .select(udf(lambda x: x ** 3)("id"))
    .show()
)

+------------+
|<lambda>(id)|
+------------+
|           0|
|           1|
|           8|
|          27|
|          64|
|         125|
|         216|
|         343|
|         512|
|         729|
+------------+



In [50]:
from pyspark.sql.types import DoubleType
# Register the same float-returning cube function under three SQL names, each
# with a different declared return type, to compare what Spark hands back.
# NOTE(review): in the recorded output the "Ints" column is empty --
# presumably because the function returns a float while IntegerType is
# declared, and Spark yields null on such a mismatch; confirm against the
# PySpark UDF documentation.
cube_as_float = lambda x: x**3*1.0
for udf_name, declared_type in (
    ("go-go-go", IntegerType()),
    ("go-go-go2", StringType()),
    ("go-go-go3", DoubleType()),
):
    spark.udf.register(udf_name, cube_as_float, declared_type)

# The registered names contain hyphens, so they must be backtick-quoted when
# called from a SQL expression.
cubes = spark.range(10).selectExpr(
    "`go-go-go`(id)", "`go-go-go2`(id)", "`go-go-go3`(id)"
)
print(tabulate(cubes.collect(), ["Ints", "strings", "floats"]))

Ints      strings    floats
------  ---------  --------
                0         0
                1         1
                8         8
               27        27
               64        64
              125       125
              216       216
              343       343
              512       512
              729       729
