#     Flipkart Data Engineering Project using Pyspark

In [0]:
#imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.functions import col,isnan,when,count
from pyspark.sql.functions import *

In [0]:
spark= SparkSession.builder.appName("Flipkart DE").getOrCreate()

In [0]:
#Load the datasets
file_path="/Volumes/workspace/de_project/dataset/Flipkart.csv"
flipkart_df=spark.read.csv(file_path,header=True,inferSchema=True)
flipkart_df.limit(10).display()

id,title,Rating,maincateg,platform,actprice1,norating1,noreviews1,star_5f,star_4f,star_3f,star_2f,star_1f,fulfilled1
2242,Casuals For Men (Blue),3.8,Men,Flipkart,999,27928,3543,14238,4295,3457,1962,3976,1
20532,Women Black Flats Sandal,3.9,Women,Flipkart,499,3015,404,1458,657,397,182,321,1
10648,Women Gold Wedges Sandal,3.9,Women,Flipkart,999,449,52,229,70,71,33,46,1
20677,Men's Height Increasing High Heel Formal Party Wear Slip-on Boots Slip On For Men (Tan),3.9,Men,Flipkart,2999,290,40,141,51,49,17,32,1
12593,Loafers For Men (Tan),3.9,Men,Flipkart,999,2423,326,1265,414,293,143,308,0
11159,Canvas Shoes For Men (Black),3.9,Men,Flipkart,999,541,72,281,104,69,17,70,0
10680,"Combo Pack Of 2 Casual Shoes Slip On Sneakers For Men (Blue, Grey)",3.9,Men,Flipkart,2400,824,105,386,199,106,61,72,1
6433,ARYA - DIFFERENT EDGE Bellies For Women (Purple),4.2,Women,Flipkart,4299,166,24,94,39,12,6,15,1
2794,"Latest Collection, Comfortable & Fashionable Bellies for Women's and Girl's Pack of 1 Bellies For Women (Pink, Grey)",3.9,Women,Flipkart,499,1816,218,899,360,239,113,208,0
11579,MOCCASSIN Slip On For Men (Black),3.8,Men,Flipkart,2299,3066,392,1304,803,462,189,308,1


In [0]:
#Checking the schema
flipkart_df.printSchema()
flipkart_df.describe().display()

root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- maincateg: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- actprice1: integer (nullable = true)
 |-- norating1: integer (nullable = true)
 |-- noreviews1: integer (nullable = true)
 |-- star_5f: integer (nullable = true)
 |-- star_4f: integer (nullable = true)
 |-- star_3f: integer (nullable = true)
 |-- star_2f: integer (nullable = true)
 |-- star_1f: integer (nullable = true)
 |-- fulfilled1: integer (nullable = true)



summary,id,title,Rating,maincateg,platform,actprice1,norating1,noreviews1,star_5f,star_4f,star_3f,star_2f,star_1f,fulfilled1
count,5244.0,5244,5041.0,5177,5244,5244.0,5244.0,5244.0,5176.0,5244.0,5244.0,5244.0,5058.0,5244.0
mean,10507.372616323417,0.0,4.011089069629038,,,1378.657894736842,2988.5800915331806,415.4910373760488,1557.443199381762,639.7854691075515,356.3567887109077,154.13996948893973,270.3977856860419,0.6045003813882532
stddev,5978.658891517549,,0.3019152228478203,,,1280.630070216534,12881.253714819482,1910.726669317451,6583.766997674783,2991.0652230817864,1632.732833888103,611.0067985620423,1035.0852878031526,0.4890043661095868
min,0.0,"""AADI MEN""""S BLACK NEW LOOK FORMAL BROGUES Derby For Men (Black)""",0.0,Men,Amazon,139.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,20964.0,Bellies For Women( beige stone) Bellies For Women (Beige),5.0,Women,Flipkart,15999.0,289973.0,45448.0,151193.0,74037.0,49924.0,12629.0,23139.0,1.0


In [0]:
#Handling missing data
flipkart_df.select([count(when(col(c).isNull(),c)).alias(c) for  c in flipkart_df.columns]).show()

##Drop the rows with missing values
flipkart_df_clean=flipkart_df.dropna()

#filling the missing values
flipkart_df_filled=flipkart_df.fillna({"Rating":0})

+---+-----+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+
| id|title|Rating|maincateg|platform|actprice1|norating1|noreviews1|star_5f|star_4f|star_3f|star_2f|star_1f|fulfilled1|
+---+-----+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+
|  0|    0|   203|       67|       0|        0|        0|         0|     68|      0|      0|      0|    186|         0|
+---+-----+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+



In [0]:
#Data Transformation
flipkart_df_transformed=flipkart_df.withColumn("EffectivePrice",expr("Price-(Price * Discount/100)"))


In [0]:
#Filling prducts with ratings greater than 4
high_rated_products=flipkart_df_filled.filter((col("Rating")>4))
high_rated_products.limit(10).display()

id,title,Rating,maincateg,platform,actprice1,norating1,noreviews1,star_5f,star_4f,star_3f,star_2f,star_1f,fulfilled1
6433,ARYA - DIFFERENT EDGE Bellies For Women (Purple),4.2,Women,Flipkart,4299,166,24,94,39,12,6,15,1
13859,Women Black Wedges Sandal,4.2,Women,Flipkart,279,3048,487,1746,679,348,114,161,1
19453,Denill Ankle Length Sneakers Sneakers For Women (Pink),4.1,Women,Flipkart,999,6806,961,3646,1508,810,335,510,1
8121,Women Grey Heels Sandal,4.2,Women,Flipkart,1990,16,1,10,2,1,3,0,1
9791,Pink Perfect Stylish Girls Casual Shoes Sneakers For Women (Pink),4.2,Women,Flipkart,999,509,73,288,106,61,17,37,1
7402,Concave 2 Wn s IDP Running Shoes For Women (Purple),4.3,Women,Flipkart,3999,367,47,206,108,30,6,18,1
3205,SFG-23 Slippers,4.1,Men,Flipkart,319,12936,1598,7030,2850,1453,587,1016,1
10607,Women Grey Sports Sandal,4.7,Women,Flipkart,1199,6,4,4,2,0,0,0,1
3659,"Women Grey, Orange Sports Sandal",4.3,Women,Flipkart,799,212,41,123,59,12,9,9,1
7625,"White casual, gym,training & Running shoes for men's Running Shoes For Men (White)",4.2,Men,Flipkart,999,2430,381,1472,397,251,105,205,0


In [0]:
#Group by the categiry and calculate avg rating
avg_rating_by_category=flipkart_df_filled.groupBy("maincateg").avg("Rating")
avg_rating_by_category = avg_rating_by_category.filter(col("maincateg").isNotNull())
avg_rating_by_category.display()

maincateg,avg(Rating)
Men,3.8350116550116575
Women,3.93901715039579


In [0]:
#Total Revenue by Category
total_revenue_by_category = flipkart_df_filled.filter("maincateg IS NOT NULL").groupBy("maincateg").agg(sum("Rating"))
total_revenue_by_category.display()

maincateg,sum(Rating)
Men,8226.100000000006
Women,11943.100000000037


In [0]:
#Save the processed data
output_table='Flipkart_Data_Analysis_Table'
flipkart_df_filled.write.mode("overwrite").saveAsTable(output_table)

In [0]:
%sql
select * from Flipkart_Data_Analysis_Table limit 20

id,title,Rating,maincateg,platform,actprice1,norating1,noreviews1,star_5f,star_4f,star_3f,star_2f,star_1f,fulfilled1
2242,Casuals For Men (Blue),3.8,Men,Flipkart,999,27928,3543,14238,4295,3457,1962,3976,1
20532,Women Black Flats Sandal,3.9,Women,Flipkart,499,3015,404,1458,657,397,182,321,1
10648,Women Gold Wedges Sandal,3.9,Women,Flipkart,999,449,52,229,70,71,33,46,1
20677,Men's Height Increasing High Heel Formal Party Wear Slip-on Boots Slip On For Men (Tan),3.9,Men,Flipkart,2999,290,40,141,51,49,17,32,1
12593,Loafers For Men (Tan),3.9,Men,Flipkart,999,2423,326,1265,414,293,143,308,0
11159,Canvas Shoes For Men (Black),3.9,Men,Flipkart,999,541,72,281,104,69,17,70,0
10680,"Combo Pack Of 2 Casual Shoes Slip On Sneakers For Men (Blue, Grey)",3.9,Men,Flipkart,2400,824,105,386,199,106,61,72,1
6433,ARYA - DIFFERENT EDGE Bellies For Women (Purple),4.2,Women,Flipkart,4299,166,24,94,39,12,6,15,1
2794,"Latest Collection, Comfortable & Fashionable Bellies for Women's and Girl's Pack of 1 Bellies For Women (Pink, Grey)",3.9,Women,Flipkart,499,1816,218,899,360,239,113,208,0
11579,MOCCASSIN Slip On For Men (Black),3.8,Men,Flipkart,2299,3066,392,1304,803,462,189,308,1
