In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session used by every cell below, or reuse one that is
# already running in this kernel; "Expr" is the application name.
spark = SparkSession.builder.appName("Expr").getOrCreate()

23/05/09 15:00:24 WARN Utils: Your hostname, wedivv-H110M-S2V resolves to a loopback address: 127.0.1.1; using 192.168.1.44 instead (on interface wlp5s0)
23/05/09 15:00:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/09 15:00:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/09 15:00:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
from pyspark.sql.types import *

# Explicit schema for the blog data set; every field is nullable
# (nullable=True is the default of StructType.add).
schema = (
    StructType()
    .add("id", IntegerType())
    .add("first_name", StringType())
    .add("last_name", StringType())
    .add("url", StringType())
    .add("published_date", DateType())
    .add("hits", IntegerType())
    .add("campaigns", ArrayType(StringType()))
)

In [4]:
# Load the blog records with the explicit schema; multiLine lets a single
# JSON record/array span several lines of the file.
blogsDF = spark.read.json(
    "./data/2-BlogData.json",
    schema=schema,
    multiLine=True,
)
blogsDF.show(truncate=False)

+---+----------+---------+----------------------------------+--------------+-----+---------------------------+
|id |first_name|last_name|url                               |published_date|hits |campaigns                  |
+---+----------+---------+----------------------------------+--------------+-----+---------------------------+
|1  |John      |Doe      |https://tinyurl.com/john-doe      |2022-01-01    |10000|[google, twitter]          |
|2  |Jane      |Doe      |https://tinyurl.com/jane-doe      |2022-01-02    |20000|[google, facebook]         |
|3  |Bob       |Smith    |https://tinyurl.com/bob-smith     |2022-01-03    |15000|[twitter, linkedin]        |
|4  |Alice     |Johnson  |https://tinyurl.com/alice-johnson |2022-01-04    |12000|[facebook]                 |
|5  |Charlie   |Garcia   |https://tinyurl.com/charlie-garcia|2022-01-05    |18000|[google, twitter, linkedin]|
+---+----------+---------+----------------------------------+--------------+-----+---------------------------+



In [5]:
from pyspark.sql.functions import *

Full name and its length

In [6]:

# Build the full name once, then measure the length of that derived column.
(
    blogsDF
    .select(concat_ws(" ", col("first_name"), col("last_name")).alias("full_name"))
    .withColumn("name_length", length(col("full_name")))
    .show()
)


+--------------+-----------+
|     full_name|name_length|
+--------------+-----------+
|      John Doe|          8|
|      Jane Doe|          8|
|     Bob Smith|          9|
| Alice Johnson|         13|
|Charlie Garcia|         14|
+--------------+-----------+



In [7]:
# Build an author identifier of the form <first_name>-<id><last_name><NN>,
# where NN is a pseudo-random integer in [LENGTH(first_name), 100).
# RAND is given a fixed seed so that re-running the cell yields the same
# identifiers (for the same partitioning of the data) instead of new ones
# on every execution.
(
    blogsDF
    .select(
        concat(
            col("first_name"),
            lit("-"),
            col("id").cast("string"),  # explicit instead of relying on the implicit int -> string cast
            col("last_name"),
            expr("CAST(FLOOR(RAND(42) * (100 - LENGTH(first_name))) AS INT) + LENGTH(first_name)"),
        ).alias("AuthorsID")
    )
    .show()
)


+-----------------+
|        AuthorsID|
+-----------------+
|      John-1Doe97|
|      Jane-2Doe63|
|     Bob-3Smith15|
| Alice-4Johnson86|
|Charlie-5Garcia95|
+-----------------+



Using `col`: arithmetic on a Column object

In [8]:
# Column-object arithmetic: the Python float literal makes the result a
# floating-point column (shown as e.g. 8700.0 in the output).
(
    blogsDF
    .select((col("HITS") * 0.87).alias("nerfed_hits"))
    .show(5)
)

+-----------+
|nerfed_hits|
+-----------+
|     8700.0|
|    17400.0|
|    13050.0|
|    10440.0|
|    15660.0|
+-----------+



Using `expr`: the same computation written as a SQL expression string

In [9]:
# Same computation as a SQL expression string; here 0.87 is parsed as a SQL
# literal, and the output shows a fixed-scale result (e.g. 8700.00).
(
    blogsDF
    .select(expr("HITS * 0.87").alias("nerfed_hits"))
    .show(5)
)

+-----------+
|nerfed_hits|
+-----------+
|    8700.00|
|   17400.00|
|   13050.00|
|   10440.00|
|   15660.00|
+-----------+



In [10]:
# Append a boolean column that flags rows with more than 13000 hits.
(
    blogsDF
    .withColumn("Big Hitters", expr("Hits > 13000"))
    .show()
)

+---+----------+---------+--------------------+--------------+-----+--------------------+-----------+
| id|first_name|last_name|                 url|published_date| hits|           campaigns|Big Hitters|
+---+----------+---------+--------------------+--------------+-----+--------------------+-----------+
|  1|      John|      Doe|https://tinyurl.c...|    2022-01-01|10000|   [google, twitter]|      false|
|  2|      Jane|      Doe|https://tinyurl.c...|    2022-01-02|20000|  [google, facebook]|       true|
|  3|       Bob|    Smith|https://tinyurl.c...|    2022-01-03|15000| [twitter, linkedin]|       true|
|  4|     Alice|  Johnson|https://tinyurl.c...|    2022-01-04|12000|          [facebook]|      false|
|  5|   Charlie|   Garcia|https://tinyurl.c...|    2022-01-05|18000|[google, twitter,...|       true|
+---+----------+---------+--------------------+--------------+-----+--------------------+-----------+



In [11]:
# Three interchangeable ways to reference the same column in select():
# a SQL expression, a Column object, and a plain column-name string.
for hits_selector in (expr("Hits"), col("Hits"), "Hits"):
    blogsDF.select(hits_selector).show(2)

+-----+
| Hits|
+-----+
|10000|
|20000|
+-----+
only showing top 2 rows

+-----+
| Hits|
+-----+
|10000|
|20000|
+-----+
only showing top 2 rows

+-----+
| Hits|
+-----+
|10000|
|20000|
+-----+
only showing top 2 rows



In [19]:
# Show all rows ordered from the most to the fewest hits.
(
    blogsDF
    .orderBy(col("hits").desc())
    .show()
)

+---+----------+---------+--------------------+--------------+-----+--------------------+
| id|first_name|last_name|                 url|published_date| hits|           campaigns|
+---+----------+---------+--------------------+--------------+-----+--------------------+
|  2|      Jane|      Doe|https://tinyurl.c...|    2022-01-02|20000|  [google, facebook]|
|  5|   Charlie|   Garcia|https://tinyurl.c...|    2022-01-05|18000|[google, twitter,...|
|  3|       Bob|    Smith|https://tinyurl.c...|    2022-01-03|15000| [twitter, linkedin]|
|  4|     Alice|  Johnson|https://tinyurl.c...|    2022-01-04|12000|          [facebook]|
|  1|      John|      Doe|https://tinyurl.c...|    2022-01-01|10000|   [google, twitter]|
+---+----------+---------+--------------------+--------------+-----+--------------------+

