In [2]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Obtain a session (getOrCreate hands back an existing one when available).
spark = SparkSession.builder.appName("df_most_interviewed").getOrCreate()

# Explicit schema: two string columns and one integer column, all declared
# with nullable=False.
table_schema = t.StructType(
    [
        t.StructField("interviwer_id", t.StringType(), nullable=False),
        t.StructField("occupation_id", t.StringType(), nullable=False),
        t.StructField("rating", t.IntegerType(), nullable=False),
    ]
)

csv_file_path = "file:///home/jovyan/work/sample/like.csv"
df = spark.read.csv(csv_file_path, schema=table_schema)

# Number of rows per occupation_id, largest group first.
interviewer_count = (
    df.groupBy("occupation_id")
    .count()
    .orderBy(f.col("count").desc())
)

# Pull the aggregated rows to the driver and print one "<id>: <count>" line each.
for d in interviewer_count.select(
    "occupation_id",
    f.col("count").alias("cnt"),
).collect():
    print(f"{d.occupation_id}: {d.cnt}")


# But what if we want to know which occupation each occupation_id stands for?
# 1100: engineer
# 2030: developer
# 3801: painter
# 3021: chemistry teacher
# 9382: priest

# Occupation code -> occupation name.
meta = dict(
    [
        ("1100", "engineer"),
        ("2030", "developer"),
        ("3801", "painter"),
        ("3021", "chemistry teacher"),
        ("9382", "priest"),
    ]
)

# Wrap the mapping in a Spark broadcast variable; it is read back via `.value`.
occupation_dict = spark.sparkContext.broadcast(meta)

def get_occupation_name(occupation_id: str) -> "str | None":
    """Translate an occupation code into its occupation name.

    The code is looked up in the mapping held by the broadcast variable
    ``occupation_dict``.

    Args:
        occupation_id: Occupation code, e.g. ``"1100"``.

    Returns:
        The name mapped to ``occupation_id``, or ``None`` when the code is not
        present in the mapping (this includes a ``None`` code).
    """
    # .get() instead of [] so a single unmapped or null code produces a null
    # value rather than a KeyError that aborts the whole UDF evaluation.
    return occupation_dict.value.get(occupation_id)

# Expose the Python lookup as a Spark UDF.
occupation_lookup_udf = f.udf(get_occupation_name)

# Add the occupation name next to each per-occupation count; the UDF is
# given the column by name.
occupation_with_name = interviewer_count.withColumn(
    "occupation_name",
    occupation_lookup_udf("occupation_id"),
)

occupation_with_name.show(10)

1100: 217
3801: 203
2030: 200
3021: 191
9382: 189
+-------------+-----+-----------------+
|occupation_id|count|  occupation_name|
+-------------+-----+-----------------+
|         1100|  217|         engineer|
|         3801|  203|          painter|
|         2030|  200|        developer|
|         3021|  191|chemistry teacher|
|         9382|  189|           priest|
+-------------+-----+-----------------+



In [3]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Obtain a session (getOrCreate hands back an existing one when available).
spark = SparkSession.builder.appName("df_most_interviewed").getOrCreate()

# Explicit schema: two string columns and one integer column, all declared
# with nullable=False.
table_schema = t.StructType(
    [
        t.StructField("interviwer_id", t.StringType(), nullable=False),
        t.StructField("occupation_id", t.StringType(), nullable=False),
        t.StructField("rating", t.IntegerType(), nullable=False),
    ]
)

csv_file_path = "file:///home/jovyan/work/sample/like.csv"
df = spark.read.csv(csv_file_path, schema=table_schema)

# Register the frame under the name "like" so it can be queried with SQL.
df.createOrReplaceTempView("like")

# Row count per occupation_id, highest count first.
spark.sql(
    "select occupation_id, count(*) from like group by 1 order by count(*) desc "
).show()



+-------------+--------+
|occupation_id|count(1)|
+-------------+--------+
|         1100|     217|
|         3801|     203|
|         2030|     200|
|         3021|     191|
|         9382|     189|
+-------------+--------+



In [6]:

# Occupation code -> occupation name.
meta = dict(
    [
        ("1100", "engineer"),
        ("2030", "developer"),
        ("3801", "painter"),
        ("3021", "chemistry teacher"),
        ("9382", "priest"),
    ]
)

# Wrap the mapping in a Spark broadcast variable.
dic = spark.sparkContext.broadcast(meta)

# Read one entry back through the broadcast handle; as the last expression
# of the cell, its value is what the notebook displays.
dic.value["1100"]

'engineer'

In [None]:
# Occupation code -> occupation name.
meta = dict(
    [
        ("1100", "engineer"),
        ("2030", "developer"),
        ("3801", "painter"),
        ("3021", "chemistry teacher"),
        ("9382", "priest"),
    ]
)

# Wrap the mapping in a Spark broadcast variable; it is read back via `.value`.
occupation_dict = spark.sparkContext.broadcast(meta)

def get_occupation_name(occupation_id: str) -> "str | None":
    """Translate an occupation code into its occupation name.

    The code is looked up in the mapping held by the broadcast variable
    ``occupation_dict``.

    Args:
        occupation_id: Occupation code, e.g. ``"1100"``.

    Returns:
        The name mapped to ``occupation_id``, or ``None`` when the code is not
        present in the mapping (this includes a ``None`` code).
    """
    # .get() instead of [] so a single unmapped or null code produces a null
    # value rather than a KeyError that aborts the whole UDF evaluation.
    return occupation_dict.value.get(occupation_id)

# Expose the Python lookup as a Spark UDF.
occupation_lookup_udf = f.udf(get_occupation_name)

# Add the occupation name next to each per-occupation count; the UDF is
# given the column by name.
occupation_with_name = interviewer_count.withColumn(
    "occupation_name",
    occupation_lookup_udf("occupation_id"),
)

occupation_with_name.show(10)

In [13]:
# Preview the loaded frame; n=20 and truncate=True are show()'s defaults,
# spelled out here for the reader.
df.show(n=20, truncate=True)

+-------------+-------------+------+
|interviwer_id|occupation_id|rating|
+-------------+-------------+------+
|        11657|         1100|     8|
|        13727|         2030|     2|
|        59892|         3801|     1|
|         6538|         3021|     6|
|        95811|         2030|     9|
|        54500|         1100|    10|
|        69741|         2030|     3|
|        51166|         2030|    10|
|        70009|         9382|     5|
|        63152|         2030|     6|
|        70758|         1100|     2|
|        35580|         2030|     5|
|        63199|         1100|    10|
|        33078|         2030|     3|
|        97480|         9382|     2|
|        47223|         1100|     8|
|        80308|         3021|     8|
|        26691|         1100|     3|
|        17194|         3021|     3|
|        96584|         2030|     4|
+-------------+-------------+------+
only showing top 20 rows



TypeError: 'Column' object is not callable

In [24]:
from  pyspark.sql import functions as f

# Occupation code -> occupation name.
meta = dict(
    [
        ("1100", "engineer"),
        ("2030", "developer"),
        ("3801", "painter"),
        ("3021", "chemistry teacher"),
        ("9382", "priest"),
    ]
)

# Wrap the mapping in a Spark broadcast variable; it is read back via `.value`.
dic = spark.sparkContext.broadcast(meta)



def get_id_name(id: str) -> "str | None":
    """Translate an occupation code into its occupation name.

    The code is looked up in the mapping held by the broadcast variable
    ``dic``. The parameter is named ``id`` (shadowing the builtin inside this
    function only); the name is kept so existing keyword callers still work.

    Args:
        id: Occupation code, e.g. ``"1100"``.

    Returns:
        The name mapped to ``id``, or ``None`` when the code is not present in
        the mapping (this includes a ``None`` code).
    """
    # .get() instead of [] so a single unmapped or null code produces a null
    # value rather than a KeyError that aborts the whole UDF evaluation.
    return dic.value.get(id)


# Expose the Python lookup as a Spark UDF.
udf = f.udf(get_id_name)

# Append the occupation name to every row and display the result; the UDF is
# given the column by name.
df.withColumn(
    "occupation_name",
    udf("occupation_id"),
).show()


+-------------+-------------+------+-----------------+
|interviwer_id|occupation_id|rating|  occupation_name|
+-------------+-------------+------+-----------------+
|        11657|         1100|     8|         engineer|
|        13727|         2030|     2|        developer|
|        59892|         3801|     1|          painter|
|         6538|         3021|     6|chemistry teacher|
|        95811|         2030|     9|        developer|
|        54500|         1100|    10|         engineer|
|        69741|         2030|     3|        developer|
|        51166|         2030|    10|        developer|
|        70009|         9382|     5|           priest|
|        63152|         2030|     6|        developer|
|        70758|         1100|     2|         engineer|
|        35580|         2030|     5|        developer|
|        63199|         1100|    10|         engineer|
|        33078|         2030|     3|        developer|
|        97480|         9382|     2|           priest|
|        4