# Importing Relevant Libraries

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Creating a Spark Session

In [2]:
# Reuse the Spark session already active in this process, or start a new one
# with default settings if none exists yet.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Optional smoke test: build a one-row DataFrame from a SQL literal and print it.
# df = spark.sql("select 'spark' as hello ")
# df.show()

# Reading a CSV File with Headers

In [3]:
# Load the product catalogue. The "header" option makes the first line of the
# file supply the column names. No schema is given and inferSchema is not
# enabled, so every column is read as a string.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("product_info_sephora.csv")
)

#### Equivalent shorthand using the csv() reader method
# df = spark.read.option("header", "true").csv("product_info_sephora.csv")

# Print the first five rows as a text table (long values are truncated).
df.show(5)

+----------+--------------------+--------+----------+-----------+------+-------+--------------+--------------------+---------------+--------------+--------------------+---------+---------------+--------------+---------------+---+-----------+------------+-----------------+--------------------+----------------+------------------+-----------------+-----------+---------------+---------------+
|product_id|        product_name|brand_id|brand_name|loves_count|rating|reviews|          size|      variation_type|variation_value|variation_desc|         ingredients|price_usd|value_price_usd|sale_price_usd|limited_edition|new|online_only|out_of_stock|sephora_exclusive|          highlights|primary_category|secondary_category|tertiary_category|child_count|child_max_price|child_min_price|
+----------+--------------------+--------+----------+-----------+------+-------+--------------+--------------------+---------------+--------------+--------------------+---------+---------------+--------------+-------

# DataFrame (DF) View in Pandas

In [4]:
#### Only 5 rows: limit on the Spark side first, so that just those rows are
#### collected to the driver and converted to a pandas DataFrame for display.
preview_rows = df.limit(5)
preview_rows.toPandas()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11,,,,...,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,85.0,30.0
2,P473662,Rainbow Bar Eau de Parfum,6342,19-69,3253,4.25,16,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
3,P473660,Kasbah Eau de Parfum,6342,19-69,3018,4.4762,21,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
4,P473658,Purple Haze Eau de Parfum,6342,19-69,2691,3.2308,13,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0


# Row Count

In [5]:
# count() is an action: it runs a Spark job over the DataFrame and returns the
# number of rows as a Python int.
row_count = df.count()
row_count

8495

# Column Count

In [6]:
# df.columns is a plain Python list of column names, so its length is the
# column count; no Spark job is needed.
column_count = len(df.columns)
column_count

27

# List of Columns

In [7]:
# Column names in schema order, as a Python list of strings.
column_names = df.columns
column_names

['product_id',
 'product_name',
 'brand_id',
 'brand_name',
 'loves_count',
 'rating',
 'reviews',
 'size',
 'variation_type',
 'variation_value',
 'variation_desc',
 'ingredients',
 'price_usd',
 'value_price_usd',
 'sale_price_usd',
 'limited_edition',
 'new',
 'online_only',
 'out_of_stock',
 'sephora_exclusive',
 'highlights',
 'primary_category',
 'secondary_category',
 'tertiary_category',
 'child_count',
 'child_max_price',
 'child_min_price']

# Filtering a DataFrame Based on a Column

In [8]:
# `reviews` is a string column here: the CSV was loaded with only the "header"
# option (no schema, no inferSchema). Cast it explicitly so the numeric
# comparison does not depend on Spark's implicit string-to-number coercion.
# Rows whose `reviews` is null (or not castable) evaluate to null and are
# dropped by the filter.
reviews_count = F.col('reviews').cast('int')

# Keep products with fewer than 10 reviews; show the first five via pandas.
df.filter(reviews_count < 10).limit(5).toPandas()

#### Alternate Method: print as a text table instead of converting to pandas

# df.filter(reviews_count < 10).show(5)

#### Alternate Method: SQL expression string as the filter condition

# df.filter("cast(reviews as int) < 10").limit(5).toPandas()

#### Alternate Method: register a temp view and query it with Spark SQL

# df.createOrReplaceTempView("df_table")
# df_filtered = spark.sql("select * from df_table where cast(reviews as int) < 10")
# spark.catalog.dropTempView("df_table")  # To remove the temp view if not required further
# df_filtered.limit(5).toPandas()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
0,P473666,Invisible Post Eau de Parfum,6342,19-69,1542,3.625,8,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'allure 2022 Best...",Fragrance,Women,Perfume,2,75.0,30.0
1,P472300,Capri Eau de Parfum,6342,19-69,1542,3.5714,7,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL Eau de Parfum Spray,...,1,0,0,"['Fresh Scent', 'Layerable Scent', 'Unisex/ Ge...",Fragrance,Women,Perfume,2,75.0,30.0
2,P473667,Invisible Post Eau de Parfum Travel Spray,6342,19-69,1377,3.625,8,0.25 oz/ 7.5 mL,Size + Concentration + Formulation,0.25 oz/ 7.5 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'allure 2022 Best...",Fragrance,Women,Rollerballs & Travel Size,0,,
3,P473670,Capri Eau de Parfum Travel Spray,6342,19-69,1206,3.5714,7,0.25 oz/ 7.5 mL,Size + Concentration + Formulation,0.25 oz/ 7.5 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Rollerballs & Travel Size,0,,
4,P473664,L'air Barbes Eau de Parfum,6342,19-69,981,3.0,4,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL Eau de Parfum Spray,...,1,0,0,"['Fresh Scent', 'Unisex/ Genderless Scent', 'L...",Fragrance,Women,Perfume,2,75.0,30.0


# Filtering a DataFrame Based on Multiple Columns

In [None]:
# Sorting a Column

In [None]:
# Sorting Multiple Columns

In [None]:
# Filter Dataframe

In [None]:
# Filter Dataframe

In [None]:
# Filter Dataframe