In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the active SparkSession, or create one for this notebook, using the
# "local[*]" master and the application name "Hands-on-1".
spark = SparkSession.builder.master("local[*]").appName("Hands-on-1").getOrCreate()

In [3]:
# Read the ratings file with no CSV options, to see what the reader does by default.
df_ratings = spark.read.csv("../../data-sets/ml-latest/ratings.csv")

In [4]:
# show() prints the top rows as a table; the first argument (n) caps how many
# rows are displayed. Because Spark is lazy, it only loads the rows it needs
# for the preview. Pass truncate=False to print values that are too wide for
# the screen in full instead of truncating them.
df_ratings.show(5)
# printSchema() prints every column with its data type and nullability.
df_ratings.printSchema()

+------+-------+------+----------+
|   _c0|    _c1|   _c2|       _c3|
+------+-------+------+----------+
|userId|movieId|rating| timestamp|
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
+------+-------+------+----------+
only showing top 5 rows

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



### As you can see, by default PySpark does not infer the schema of the data and treats every column as a string. It also does not treat the first row of a .csv file as a header by default. To change this, we need to pass some arguments to the `.read.csv` method of the `spark` object.

In [5]:
# Re-read the ratings file, this time telling the CSV reader how to parse it.
df_ratings = spark.read.csv(
    "../../data-sets/ml-latest/ratings.csv",
    # Use the first row as the column names instead of treating it as data.
    header=True,
    # Infer the schema (i.e. the data type of each column) from the data.
    inferSchema=True,
    # Field separator: most .csv files are comma-separated; set your own
    # separator symbol here for anything else.
    sep=",",
    # Quote character: most .csv files wrap values that contain commas in
    # double quotes; set another character here if the file uses one.
    quote='"',
    # File encoding: most .csv files are UTF-8; ISO-8859-1 is also very common.
    encoding="UTF-8",
)

In [6]:
# Preview the first five rows, then print the column names and inferred types.
df_ratings.show(5)
df_ratings.printSchema()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
+------+-------+------+----------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



### As you can see, by providing the proper arguments we were able to parse the .csv file in the desired format. Note that `inferSchema` should only be used for ad hoc analysis, since it needs an extra pass over the data and the inferred types depend on the file's contents; in production we should always enforce an explicit schema on our files for type safety.

In [7]:
# Read the ratings file with an explicit schema instead of inferSchema, so the
# column types are fixed up front rather than guessed from the data.
df_ratings = (
    spark
    .read
    .csv(
        path="../../data-sets/ml-latest/ratings.csv",
        sep=",", # most .csv files are comma-separated; specify your own separator symbol here
        header=True, # the first row holds the column names, so it is not read as data
        # Explicit schema as a DDL string (column name + type); it replaces inferSchema=True.
        # NOTE(review): timestamp looks like epoch seconds; INT matches the type
        # inferSchema produced earlier, but LONG may be safer for larger values - confirm.
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
        # Fail on rows that do not match the schema. With the default PERMISSIVE
        # mode, malformed values are silently turned into nulls, which defeats the
        # purpose of enforcing a schema. The error surfaces when an action runs.
        mode="FAILFAST",
        quote='"', # most .csv files wrap values containing commas in double quotes; specify another character here if needed
        encoding="UTF-8", # most .csv files are UTF-8 encoded; ISO-8859-1 is also very common
    )
)

In [8]:
# Preview the first five rows and confirm the columns carry the declared types.
df_ratings.show(5)
df_ratings.printSchema()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
+------+-------+------+----------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [9]:
# Stop the SparkSession now that the notebook no longer needs it.
spark.stop()