# Coding Challenge DEC 12 - Sivaprakash V

## Initiate Session

In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook under the application name "Coding"
# (getOrCreate returns the already-active session when there is one).
spark = (
    SparkSession.builder
    .appName("Coding")
    .getOrCreate()
)

## Sample Dataframe creation

In [0]:
# Column names, listed in the same order as the fields of each record below
columns = ["StudentID", "Name", "Age", "Grade"]

# Five sample student records: (StudentID, Name, Age, Grade)
data = [
    (1, "Alice", 20, "A"),
    (2, "Bob", 22, "B"),
    (3, "Catherine", 21, "A"),
    (4, "David", 23, "C"),
    (5, "Eve", 20, "B"),
]

# Build the DataFrame from the records, naming the columns from `columns`
student_df = spark.createDataFrame(data, columns)

# Preview the resulting rows
student_df.show()

+---------+---------+---+-----+
|StudentID|     Name|Age|Grade|
+---------+---------+---+-----+
|        1|    Alice| 20|    A|
|        2|      Bob| 22|    B|
|        3|Catherine| 21|    A|
|        4|    David| 23|    C|
|        5|      Eve| 20|    B|
+---------+---------+---+-----+



## Extraction

In [0]:
# Location of the sales CSV file
file_path = "/FileStore/tables/sales.csv"

# Load the CSV: the first row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

## Exploratory Data Analysis

In [0]:
# Print the Python type of the object returned by the CSV read
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [0]:
# Print the total number of records (rows) in df
print(df.count())

189


In [0]:
# Print the list of column names in df
print(df.columns)

['Date', 'Day', 'Month', 'Year', 'Customer_Age', 'Age_Group', 'Customer_Gender', 'Country', 'State', 'Product_Category', 'Sub_Category', 'Product', 'Order_Quantity', 'Unit_Cost', 'Unit_Price', 'Profit', 'Cost', 'Revenue']


In [0]:
# Show the schema of df (column names, types and nullability).
# printSchema() writes the schema tree to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Customer_Age: integer (nullable = true)
 |-- Age_Group: string (nullable = true)
 |-- Customer_Gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Sub_Category: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Order_Quantity: integer (nullable = true)
 |-- Unit_Cost: integer (nullable = true)
 |-- Unit_Price: integer (nullable = true)
 |-- Profit: integer (nullable = true)
 |-- Cost: integer (nullable = true)
 |-- Revenue: integer (nullable = true)

None


## Transformations & Visualizations

### Revenue Trend Over Time

In [0]:
# Sum Revenue for every (Year, Month) pair and expose the total as Total_Revenue
revenue_trend = (
    df.groupBy("Year", "Month")
    .sum("Revenue")
    .withColumnRenamed("sum(Revenue)", "Total_Revenue")
)
revenue_trend.display()

Year,Month,Total_Revenue
2014,July,1947
2013,August,11020
2014,May,13718
2013,December,4933
2016,June,5107
2016,May,13729
2016,March,8952
2014,June,5288
2013,July,8161
2013,September,10904


Databricks visualization. Run in Databricks to view.

### Customer Age Group Distribution

In [0]:
# Count how many sales records fall into each customer age group
age_group_distribution = (
    df
    .groupBy("Age_Group")
    .count()
)
age_group_distribution.display()

Age_Group,count
Youth (<25),16
Adults (35-64),101
Seniors (64+),4
Young Adults (25-34),68


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

### Country-wise Revenue Analysis

In [0]:
# Sum Revenue per country and expose the total as Total_Revenue
revenue_by_country = (
    df.groupBy("Country")
    .sum("Revenue")
    .withColumnRenamed("sum(Revenue)", "Total_Revenue")
)
revenue_by_country.display()

Country,Total_Revenue
Germany,9703
France,10334
United States,91811
Canada,38136
Australia,23119
United Kingdom,4147


Databricks visualization. Run in Databricks to view.

### Gender-based Revenue Analysis

In [0]:
# Sum Revenue per customer gender and expose the total as Total_Revenue
gender_revenue = (
    df.groupBy("Customer_Gender")
    .sum("Revenue")
    .withColumnRenamed("sum(Revenue)", "Total_Revenue")
)
gender_revenue.display()

Customer_Gender,Total_Revenue
F,99555
M,77695


Databricks visualization. Run in Databricks to view.

### Remove date, day and customer age columns and drop duplicates

In [0]:
# Columns to remove from the raw sales DataFrame
columns_to_drop = ["Date", "Day", "Customer_Age"]

# New DataFrame without those columns (df itself is left unchanged)
transformed_df = df.drop(*columns_to_drop)

# Keep a single copy of each fully identical row.
# NOTE(review): rows that differed only in the dropped columns are now identical
# and collapse into one, which affects any totals computed from this DataFrame.
deduplicated_df = transformed_df.drop_duplicates()

In [0]:
# Print the schema of the deduplicated DataFrame to inspect its remaining columns
deduplicated_df.printSchema()

root
 |-- Month: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Age_Group: string (nullable = true)
 |-- Customer_Gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Sub_Category: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Order_Quantity: integer (nullable = true)
 |-- Unit_Cost: integer (nullable = true)
 |-- Unit_Price: integer (nullable = true)
 |-- Profit: integer (nullable = true)
 |-- Cost: integer (nullable = true)
 |-- Revenue: integer (nullable = true)



### Analysis in Spark SQL

In [0]:
# Register the deduplicated DataFrame as a temporary view named "df" so it can
# be queried from SQL cells.
# NOTE: the view shares its name with the Python variable `df` (the raw CSV
# DataFrame), but it exposes the deduplicated data, not the raw data.
deduplicated_df.createOrReplaceTempView("df")

### Total Revenue and Profit by State

In [0]:
%sql
-- Revenue and profit totals per state, highest revenue first
SELECT
    State,
    SUM(Revenue) AS Total_Revenue,
    SUM(Profit)  AS Total_Profit
FROM df
GROUP BY State
ORDER BY SUM(Revenue) DESC;


State,Total_Revenue,Total_Profit
California,61036,37681
British Columbia,37898,23543
Oregon,16613,9368
Washington,13946,7241
New South Wales,9605,5465
Victoria,7869,4134
Hessen,6361,3796
Moselle,5808,3333
Queensland,5645,3125
England,4147,2527


Databricks visualization. Run in Databricks to view.

### Average Order Quantity by Age Group and Gender

In [0]:
%sql
-- Mean order quantity for each age group / gender combination, largest first
SELECT Age_Group, Customer_Gender, AVG(Order_Quantity) AS Avg_Order_Quantity
FROM df
GROUP BY Age_Group, Customer_Gender
ORDER BY AVG(Order_Quantity) DESC;


Age_Group,Customer_Gender,Avg_Order_Quantity
Adults (35-64),F,10.75
Young Adults (25-34),F,8.84375
Adults (35-64),M,8.340425531914894
Young Adults (25-34),M,7.916666666666667
Youth (<25),M,7.25
Youth (<25),F,4.166666666666667
Seniors (64+),F,2.0


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

### Low-Performing States

In [0]:
%sql
-- States whose summed revenue is below 10000, lowest revenue first
SELECT State, SUM(Revenue) AS Total_Revenue
FROM df
GROUP BY State
HAVING SUM(Revenue) < 10000
ORDER BY SUM(Revenue) ASC;


State,Total_Revenue
Kentucky,216
Nordrhein-Westfalen,522
Hamburg,558
Loir et Cher,1282
Seine Saint Denis,1574
Nord,1670
Saarland,2262
England,4147
Queensland,5645
Moselle,5808


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Total revenue per product within each season, highest revenue first per season.
-- Every season lists its months explicitly; a NULL or unrecognised Month value
-- falls into 'Unknown' instead of being silently counted as Autumn.
SELECT 
    CASE 
        WHEN Month IN ('December', 'January', 'February') THEN 'Winter'
        WHEN Month IN ('March', 'April', 'May') THEN 'Spring'
        WHEN Month IN ('June', 'July', 'August') THEN 'Summer'
        WHEN Month IN ('September', 'October', 'November') THEN 'Autumn'
        ELSE 'Unknown'
    END AS Season, 
    Product, 
    SUM(Revenue) AS Total_Revenue
FROM df
GROUP BY Season, Product
ORDER BY Season, Total_Revenue DESC;


Season,Product,Total_Revenue
Autumn,Hitch Rack - 4-Bike,44989
Spring,Hitch Rack - 4-Bike,48991
Summer,Hitch Rack - 4-Bike,49836
Winter,Hitch Rack - 4-Bike,33196


Databricks visualization. Run in Databricks to view.

## Load

In [0]:
# Write the transformed, deduplicated data to the Delta table
# default.deduplicated_table using overwrite mode.
(
    deduplicated_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("default.deduplicated_table")
)


# Thank you
# _Sivaprakash V_