In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T

In [3]:
# `spark` is not defined anywhere in this notebook, so create the session explicitly.
# getOrCreate() returns the already-active session when one exists.
spark = SparkSession.builder.getOrCreate()

# Read the product CSV using its first row as the header. No schema is supplied,
# so every column is loaded as a string (see the printSchema output below).
df = spark.read.format("csv").option("header","true").load("file:///D:/data/SalesLT.product.csv")

In [6]:
# Preview the first two rows of the raw product data.
df.show(n=2)

+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|ProductID|                Name|ProductNumber|Color|StandardCost|ListPrice|Size| Weight|ProductCategoryID|ProductModelID|       SellStartDate|SellEndDate|DiscontinuedDate|ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|
+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|      680|HL Road Frame - B...|   FR-R92B-58|Black|   1059.3100|1431.5000|  58|1016.04|               18|             6|2002-06-01T00:00:...|       null|            null|   [B@7fa7208e|  no_image_availabl...|43dd68d6-14a4-461...|2008-03-11T10:

### Create a temp view to run Spark SQL

In [7]:
# Register the DataFrame as the temp view "sdf" so it can be queried with spark.sql().
# createOrReplaceTempView() returns None, so there is nothing useful to assign.
df.createOrReplaceTempView("sdf")

In [8]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- ProductID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- ProductNumber: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- StandardCost: string (nullable = true)
 |-- ListPrice: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- ProductCategoryID: string (nullable = true)
 |-- ProductModelID: string (nullable = true)
 |-- SellStartDate: string (nullable = true)
 |-- SellEndDate: string (nullable = true)
 |-- DiscontinuedDate: string (nullable = true)
 |-- ThumbNailPhoto: string (nullable = true)
 |-- ThumbnailPhotoFileName: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)



In [9]:
# Keep the filtered DataFrame in the variable; .show() only prints and returns None,
# so chaining it onto the assignment would leave the variable set to None.
# NOTE(review): ProductModelID is a string column, so this comparison relies on
# Spark's implicit string-to-number coercion.
prod_model_filter_df = df.filter(col("ProductModelID") >= 6)
prod_model_filter_df.show(3)

+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|ProductID|                Name|ProductNumber|Color|StandardCost|ListPrice|Size| Weight|ProductCategoryID|ProductModelID|       SellStartDate|SellEndDate|DiscontinuedDate|ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|
+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|      680|HL Road Frame - B...|   FR-R92B-58|Black|   1059.3100|1431.5000|  58|1016.04|               18|             6|2002-06-01T00:00:...|       null|            null|   [B@7fa7208e|  no_image_availabl...|43dd68d6-14a4-461...|2008-03-11T10:

### The same filter expressed in Spark SQL

prod_model_filter_sdf = spark.sql("select * from sdf where ProductModelID >= 6").show(2)

In [33]:
# Run the same filter through Spark SQL against the "sdf" temp view.
# The DataFrame is assigned first because .show() returns None.
prod_model_filter_sdf = spark.sql("select * from sdf where ProductModelID >= 6")
prod_model_filter_sdf.show(2)

+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|ProductID|                Name|ProductNumber|Color|StandardCost|ListPrice|Size| Weight|ProductCategoryID|ProductModelID|       SellStartDate|SellEndDate|DiscontinuedDate|ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|
+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|      680|HL Road Frame - B...|   FR-R92B-58|Black|   1059.3100|1431.5000|  58|1016.04|               18|             6|2002-06-01T00:00:...|       null|            null|   [B@7fa7208e|  no_image_availabl...|43dd68d6-14a4-461...|2008-03-11T10:

### Rename a column

In [11]:
# withColumnRenamed returns a new DataFrame; `df` itself is unchanged.
# Assign before calling .show(), which only prints and returns None.
renamed_col_df = df.withColumnRenamed("ProductModelID", "ProductModel_ID")
renamed_col_df.show(2)

+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+---------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|ProductID|                Name|ProductNumber|Color|StandardCost|ListPrice|Size| Weight|ProductCategoryID|ProductModel_ID|       SellStartDate|SellEndDate|DiscontinuedDate|ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|
+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+---------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|      680|HL Road Frame - B...|   FR-R92B-58|Black|   1059.3100|1431.5000|  58|1016.04|               18|              6|2002-06-01T00:00:...|       null|            null|   [B@7fa7208e|  no_image_availabl...|43dd68d6-14a4-461...|2008-03-11

### The same rename expressed in Spark SQL

In [15]:
# "sdf" is a temporary view, which ALTER TABLE ... RENAME COLUMN does not operate on
# (and the original statement also had the old and new names reversed).
# The SQL counterpart of withColumnRenamed is to alias the column in a SELECT,
# so build the select list from the DataFrame's columns and alias only the target.
select_list = ", ".join(
    "`{}` AS `ProductModel_ID`".format(name) if name == "ProductModelID" else "`{}`".format(name)
    for name in df.columns
)
renamed_col_sdf = spark.sql("SELECT {} FROM sdf".format(select_list))
renamed_col_sdf.show(2)

In [18]:
# Preview two rows of the "sdf" temp view; the view still exposes ProductModelID.
(
    spark
    .sql("select * from sdf limit 2")
    .show(n=2)
)

+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|ProductID|                Name|ProductNumber|Color|StandardCost|ListPrice|Size| Weight|ProductCategoryID|ProductModelID|       SellStartDate|SellEndDate|DiscontinuedDate|ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|
+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+
|      680|HL Road Frame - B...|   FR-R92B-58|Black|   1059.3100|1431.5000|  58|1016.04|               18|             6|2002-06-01T00:00:...|       null|            null|   [B@7fa7208e|  no_image_availabl...|43dd68d6-14a4-461...|2008-03-11T10:

### Use a conditional expression (`when` / `otherwise`)

In [21]:


# StandardCost is loaded as a string, so cast it explicitly to double before comparing
# against the numeric thresholds instead of relying on implicit coercion.
# NOTE(review): implicit string-vs-int comparison may coerce via an integer cast and
# drop the fractional part of the cost -- confirm against the Spark version in use.
standard_cost = col("StandardCost").cast("double")

# Bucket each product by cost. Conditions are evaluated in order, so each branch
# only sees rows that did not match an earlier one.
condition_df = df.withColumn(
    "Conditional Syandard Column",
    when(standard_cost <= 500, "low standard")
    .when(standard_cost <= 1000, "medium standard")
    .when(standard_cost <= 2001, "High Standard")
    .otherwise("Very High Standard"),
)

# Show separately: .show() returns None and must not be chained onto the assignment.
condition_df.show()

+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+--------------------+----------------+--------------+----------------------+--------------------+--------------------+---------------------------+
|ProductID|                Name|ProductNumber|Color|StandardCost|ListPrice|Size| Weight|ProductCategoryID|ProductModelID|       SellStartDate|         SellEndDate|DiscontinuedDate|ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|Conditional Syandard Column|
+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+--------------------+----------------+--------------+----------------------+--------------------+--------------------+---------------------------+
|      680|HL Road Frame - B...|   FR-R92B-58|Black|   1059.3100|1431.5000|  58|1016.04|               18|             6|2002-06-01T0

### Convert string to double

In [22]:

# Show the schema before casting: every column is still a string at this point.
df.printSchema()


root
 |-- ProductID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- ProductNumber: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- StandardCost: string (nullable = true)
 |-- ListPrice: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- ProductCategoryID: string (nullable = true)
 |-- ProductModelID: string (nullable = true)
 |-- SellStartDate: string (nullable = true)
 |-- SellEndDate: string (nullable = true)
 |-- DiscontinuedDate: string (nullable = true)
 |-- ThumbNailPhoto: string (nullable = true)
 |-- ThumbnailPhotoFileName: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)



In [24]:
# Add a double-typed copy of ProductModelID as a new column;
# the original string column is kept alongside it.
string_double_df = df.withColumn(
    "new_productModelID",
    col("ProductModelID").cast(DoubleType()),
)

In [26]:
# Preview three rows, including the newly added double column.
string_double_df.show(n=3)

+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+------------------+
|ProductID|                Name|ProductNumber|Color|StandardCost|ListPrice|Size| Weight|ProductCategoryID|ProductModelID|       SellStartDate|SellEndDate|DiscontinuedDate|ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|new_productModelID|
+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+-----------+----------------+--------------+----------------------+--------------------+--------------------+------------------+
|      680|HL Road Frame - B...|   FR-R92B-58|Black|   1059.3100|1431.5000|  58|1016.04|               18|             6|2002-06-01T00:00:...|       null|            null|   [B@7fa7208e| 

### Now you can see that `new_productModelID` has the double data type

In [27]:
# Confirm the new column's type: new_productModelID is reported as double.
string_double_df.printSchema()

root
 |-- ProductID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- ProductNumber: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- StandardCost: string (nullable = true)
 |-- ListPrice: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- ProductCategoryID: string (nullable = true)
 |-- ProductModelID: string (nullable = true)
 |-- SellStartDate: string (nullable = true)
 |-- SellEndDate: string (nullable = true)
 |-- DiscontinuedDate: string (nullable = true)
 |-- ThumbNailPhoto: string (nullable = true)
 |-- ThumbnailPhotoFileName: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)
 |-- new_productModelID: double (nullable = true)



### Convert string to int

In [28]:
# Replace ListPrice with an integer-typed version of itself (same column name,
# so the string column is overwritten in the resulting DataFrame).
# NOTE(review): the source values carry decimals (e.g. 1431.5000); an int cast
# presumably drops the fractional part -- confirm this is intended.
string_int_df = df.withColumn(
    "ListPrice",
    col("ListPrice").cast(IntegerType()),
)

In [29]:
# Confirm the cast: ListPrice is now reported as integer.
string_int_df.printSchema()

root
 |-- ProductID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- ProductNumber: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- StandardCost: string (nullable = true)
 |-- ListPrice: integer (nullable = true)
 |-- Size: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- ProductCategoryID: string (nullable = true)
 |-- ProductModelID: string (nullable = true)
 |-- SellStartDate: string (nullable = true)
 |-- SellEndDate: string (nullable = true)
 |-- DiscontinuedDate: string (nullable = true)
 |-- ThumbNailPhoto: string (nullable = true)
 |-- ThumbnailPhotoFileName: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)



### Get size of dataframe

In [30]:
# Report the DataFrame's dimensions: count() runs a Spark job for the row total,
# while the column total comes from the column-name list.
print(f"{df.count()} rows")
print(f"{len(df.columns)} columns")

295 rows
17 columns


### Get Number of Partitions

In [31]:
# Report how many partitions back the DataFrame's underlying RDD.
print(f"{df.rdd.getNumPartitions()} partition(s)")

1 partition(s)


### Get data types of a DataFrame's columns

In [32]:
# Print the (column name, type name) pairs for every column of the DataFrame.
print(df.dtypes)

[('ProductID', 'string'), ('Name', 'string'), ('ProductNumber', 'string'), ('Color', 'string'), ('StandardCost', 'string'), ('ListPrice', 'string'), ('Size', 'string'), ('Weight', 'string'), ('ProductCategoryID', 'string'), ('ProductModelID', 'string'), ('SellStartDate', 'string'), ('SellEndDate', 'string'), ('DiscontinuedDate', 'string'), ('ThumbNailPhoto', 'string'), ('ThumbnailPhotoFileName', 'string'), ('rowguid', 'string'), ('ModifiedDate', 'string')]
