### Fixing issues in our data

In [9]:
from pyspark.sql import SparkSession, functions as f

In [2]:
# Create (or reuse, if one is already running) the SparkSession for this notebook:
# application name "Hands-on-2", master "local[*]".
spark = SparkSession.builder.appName("Hands-on-2").master("local[*]").getOrCreate()

### Loading ratings dataset using proper arguments

In [33]:
# Read the ratings CSV, spelling out every parsing option instead of relying on defaults.
df_ratings = spark.read.csv(
    "../../data-sets/ml-latest/ratings.csv",
    # explicit DDL schema: column names and types are declared up front
    schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
    # the first line of the file is a header row
    header=True,
    sep=",",
    quote='"',
    encoding="UTF-8",
)

In [34]:
# Preview the first five rows, then print the column types of the loaded frame.
df_ratings.show(5)
df_ratings.printSchema()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
+------+-------+------+----------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



### The pyspark.sql.functions module contains a lot of functions for operating on our data, e.g. functions.from_unixtime, functions.to_timestamp, etc.

In [35]:
# According to the documentation (README.txt) of the ml-latest dataset, the timestamp
# column holds Unix (POSIX) time, i.e. the number of seconds since midnight UTC on
# 1 January 1970. It has to be converted to a Spark timestamp for proper analysis.

# Step 1: rename the timestamp column to timestamp_unix.
df_ratings_renamed = (
    df_ratings
    .withColumnRenamed(existing="timestamp", new="timestamp_unix")
)

df_ratings_renamed.show(5)
df_ratings_renamed.printSchema()

+------+-------+------+--------------+
|userId|movieId|rating|timestamp_unix|
+------+-------+------+--------------+
|     1|    307|   3.5|    1256677221|
|     1|    481|   3.5|    1256677456|
|     1|   1091|   1.5|    1256677471|
|     1|   1257|   4.5|    1256677460|
|     1|   1449|   4.5|    1256677264|
+------+-------+------+--------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp_unix: integer (nullable = true)



In [36]:
# Step 2: add a new column named timestamp by applying f.from_unixtime to
# timestamp_unix. The result is the *string* representation of the Unix time.
df_ratings_with_timestamp_as_string = df_ratings_renamed.withColumn(
    "timestamp",
    f.from_unixtime(f.col("timestamp_unix")),
)

df_ratings_with_timestamp_as_string.show(5)
df_ratings_with_timestamp_as_string.printSchema()

+------+-------+------+--------------+-------------------+
|userId|movieId|rating|timestamp_unix|          timestamp|
+------+-------+------+--------------+-------------------+
|     1|    307|   3.5|    1256677221|2009-10-28 02:30:21|
|     1|    481|   3.5|    1256677456|2009-10-28 02:34:16|
|     1|   1091|   1.5|    1256677471|2009-10-28 02:34:31|
|     1|   1257|   4.5|    1256677460|2009-10-28 02:34:20|
|     1|   1449|   4.5|    1256677264|2009-10-28 02:31:04|
+------+-------+------+--------------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp_unix: integer (nullable = true)
 |-- timestamp: string (nullable = true)



In [37]:
# Step 3: apply f.to_timestamp to the string timestamp column and store the result
# under the same name, which replaces the string column with a timestamp-typed one.
df_ratings_with_timestamp_as_timestamp = df_ratings_with_timestamp_as_string.withColumn(
    "timestamp",
    f.to_timestamp(f.col("timestamp")),
)

df_ratings_with_timestamp_as_timestamp.show(5)
df_ratings_with_timestamp_as_timestamp.printSchema()

+------+-------+------+--------------+-------------------+
|userId|movieId|rating|timestamp_unix|          timestamp|
+------+-------+------+--------------+-------------------+
|     1|    307|   3.5|    1256677221|2009-10-28 02:30:21|
|     1|    481|   3.5|    1256677456|2009-10-28 02:34:16|
|     1|   1091|   1.5|    1256677471|2009-10-28 02:34:31|
|     1|   1257|   4.5|    1256677460|2009-10-28 02:34:20|
|     1|   1449|   4.5|    1256677264|2009-10-28 02:31:04|
+------+-------+------+--------------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp_unix: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [38]:
# Spark DataFrames are immutable: each operation returns a new DataFrame. That lets
# us chain the three steps above into a single expression, with no intermediate
# variables, which keeps the code readable and easy to follow.

df_final = (
    df_ratings
    .withColumnRenamed(existing="timestamp", new="timestamp_unix")
    .withColumn("timestamp", f.from_unixtime(f.col("timestamp_unix")))
    .withColumn("timestamp", f.to_timestamp(f.col("timestamp")))
)

df_final.show(5)
df_final.printSchema()

+------+-------+------+--------------+-------------------+
|userId|movieId|rating|timestamp_unix|          timestamp|
+------+-------+------+--------------+-------------------+
|     1|    307|   3.5|    1256677221|2009-10-28 02:30:21|
|     1|    481|   3.5|    1256677456|2009-10-28 02:34:16|
|     1|   1091|   1.5|    1256677471|2009-10-28 02:34:31|
|     1|   1257|   4.5|    1256677460|2009-10-28 02:34:20|
|     1|   1449|   4.5|    1256677264|2009-10-28 02:31:04|
+------+-------+------+--------------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp_unix: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [39]:
# We can go one step further and nest f.from_unixtime inside f.to_timestamp: the
# two functions are composable (thanks to functional programming), so a single
# withColumn call is enough.

df_final = (
    df_ratings
    .withColumnRenamed(existing="timestamp", new="timestamp_unix")
    .withColumn(
        "timestamp",
        f.to_timestamp(f.from_unixtime(f.col("timestamp_unix"))),
    )
)

df_final.show(5)
df_final.printSchema()

+------+-------+------+--------------+-------------------+
|userId|movieId|rating|timestamp_unix|          timestamp|
+------+-------+------+--------------+-------------------+
|     1|    307|   3.5|    1256677221|2009-10-28 02:30:21|
|     1|    481|   3.5|    1256677456|2009-10-28 02:34:16|
|     1|   1091|   1.5|    1256677471|2009-10-28 02:34:31|
|     1|   1257|   4.5|    1256677460|2009-10-28 02:34:20|
|     1|   1449|   4.5|    1256677264|2009-10-28 02:31:04|
+------+-------+------+--------------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp_unix: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [40]:
# One more step: attach the rename and the timestamp conversion directly to the
# spark.read.csv call, so loading and fixing the data is a single expression.
# This makes the code less verbose and easier to understand.

df_ratings = (
    spark.read.csv(
        "../../data-sets/ml-latest/ratings.csv",
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
        header=True,
        sep=",",
        quote='"',
        encoding="UTF-8",
    )
    .withColumnRenamed(existing="timestamp", new="timestamp_unix")
    .withColumn(
        "timestamp",
        f.to_timestamp(f.from_unixtime(f.col("timestamp_unix"))),
    )
)

In [41]:
# The frame now carries both timestamp_unix and the converted timestamp column.
df_ratings.show(5)
df_ratings.printSchema()

+------+-------+------+--------------+-------------------+
|userId|movieId|rating|timestamp_unix|          timestamp|
+------+-------+------+--------------+-------------------+
|     1|    307|   3.5|    1256677221|2009-10-28 02:30:21|
|     1|    481|   3.5|    1256677456|2009-10-28 02:34:16|
|     1|   1091|   1.5|    1256677471|2009-10-28 02:34:31|
|     1|   1257|   4.5|    1256677460|2009-10-28 02:34:20|
|     1|   1449|   4.5|    1256677264|2009-10-28 02:31:04|
+------+-------+------+--------------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp_unix: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [42]:
# For our dataset timestamp_unix is of no further use, so let's say we have to
# remove it. The .drop method of Spark DataFrames does exactly that.

df_ratings = (
    spark.read.csv(
        "../../data-sets/ml-latest/ratings.csv",
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
        header=True,
        sep=",",
        quote='"',
        encoding="UTF-8",
    )
    .withColumnRenamed(existing="timestamp", new="timestamp_unix")
    .withColumn(
        "timestamp",
        f.to_timestamp(f.from_unixtime(f.col("timestamp_unix"))),
    )
    # the drop, too, can stay part of the spark.read.csv chain
    .drop("timestamp_unix")
)

In [43]:
# timestamp_unix is gone; only the converted timestamp column remains.
df_ratings.show(5)
df_ratings.printSchema()

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    307|   3.5|2009-10-28 02:30:21|
|     1|    481|   3.5|2009-10-28 02:34:16|
|     1|   1091|   1.5|2009-10-28 02:34:31|
|     1|   1257|   4.5|2009-10-28 02:34:20|
|     1|   1449|   4.5|2009-10-28 02:31:04|
+------+-------+------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [44]:
# One thing to note: if a column name passed to .drop does not exist in the
# DataFrame, .drop simply does nothing for that name. For example:

df_ratings = (
    spark.read.csv(
        "../../data-sets/ml-latest/ratings.csv",
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
        header=True,
        sep=",",
        quote='"',
        encoding="UTF-8",
    )
    .withColumnRenamed(existing="timestamp", new="timestamp_unix")
    .withColumn(
        "timestamp",
        f.to_timestamp(f.from_unixtime(f.col("timestamp_unix"))),
    )
    # "foobar" is not a column of this frame, yet nothing breaks
    .drop("timestamp_unix", "foobar")
)

In [45]:
# Same result as before: dropping the non-existent "foobar" column changed nothing.
df_ratings.show(5)
df_ratings.printSchema()

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    307|   3.5|2009-10-28 02:30:21|
|     1|    481|   3.5|2009-10-28 02:34:16|
|     1|   1091|   1.5|2009-10-28 02:34:31|
|     1|   1257|   4.5|2009-10-28 02:34:20|
|     1|   1449|   4.5|2009-10-28 02:31:04|
+------+-------+------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [46]:
# Shut down the SparkSession created at the top of the notebook now that we are done.
spark.stop()