## Setup

1. create workspace
2. create lakehouse
3. upload files from explorer panel



## create table in lakehouse

1. from the explorer panel, select the .csv file and choose "Load to table"

## set up the warehouse

1. create a warehouse
2. create fact table
3. create dim tables

        -- Star schema for the sales data: one fact table and two dimension tables,
        -- all in the Sales schema. GO ends the batch so the schema exists before
        -- the tables are created. CREATE SCHEMA itself is not guarded and raises
        -- an error if the schema already exists.
        CREATE SCHEMA [Sales]
        GO

        -- Fact table: one row per sales order line. CustomerID and ItemID hold the
        -- keys of Sales.Dim_Customer and Sales.Dim_Item.
        -- NOTE(review): TaxAmount and UnitPrice are FLOAT; an exact DECIMAL type is
        -- usually preferred for monetary values -- confirm before changing.
        IF OBJECT_ID('Sales.Fact_Sales', 'U') IS NULL
            CREATE TABLE Sales.Fact_Sales (
                CustomerID VARCHAR(255) NOT NULL,
                ItemID VARCHAR(255) NOT NULL,
                SalesOrderNumber VARCHAR(30),
                SalesOrderLineNumber INT,
                OrderDate DATE,
                Quantity INT,
                TaxAmount FLOAT,
                UnitPrice FLOAT
            );

        -- Customer dimension. The table and its primary key are created inside the
        -- same guarded block so that re-running the script does not fail with
        -- "constraint already exists" once the table is in place.
        IF OBJECT_ID('Sales.Dim_Customer', 'U') IS NULL
        BEGIN
            CREATE TABLE Sales.Dim_Customer (
                CustomerID VARCHAR(255) NOT NULL,
                CustomerName VARCHAR(255) NOT NULL,
                EmailAddress VARCHAR(255) NOT NULL
            );

            -- NOT ENFORCED: the key is declared but uniqueness is not checked on
            -- insert, so the load process must keep CustomerID unique itself.
            ALTER TABLE Sales.Dim_Customer
                ADD CONSTRAINT PK_Dim_Customer PRIMARY KEY NONCLUSTERED (CustomerID) NOT ENFORCED;
        END
        GO

        -- Item dimension, created together with its primary key (same guard as above).
        IF OBJECT_ID('Sales.Dim_Item', 'U') IS NULL
        BEGIN
            CREATE TABLE Sales.Dim_Item (
                ItemID VARCHAR(255) NOT NULL,
                ItemName VARCHAR(255) NOT NULL
            );

            -- NOT ENFORCED: the load process must keep ItemID unique itself.
            ALTER TABLE Sales.Dim_Item
                ADD CONSTRAINT PK_Dim_Item PRIMARY KEY NONCLUSTERED (ItemID) NOT ENFORCED;
        END
        GO

4. create view pointing at the lakehouse

        -- Staging view over the lakehouse table, queried through a three-part name.
        -- Columns are listed explicitly (instead of SELECT *) so the view's shape
        -- does not silently depend on the lakehouse table's column set; these are
        -- the columns that Sales.LoadDataFromStaging reads.
        -- Replace <your lakehouse name> with the name of your lakehouse.
        CREATE VIEW Sales.Staging_Sales
        AS
        SELECT
            SalesOrderNumber,
            SalesOrderLineNumber,
            OrderDate,
            CustomerName,
            EmailAddress,
            Item,
            Quantity,
            UnitPrice,
            TaxAmount
        FROM [<your lakehouse name>].[dbo].[staging_sales];

* Keep the "[" and "]" around the lakehouse name so that the warehouse can connect to the lakehouse.



## load data to warehouse (using new PROCEDURE)

        -- Loads one calendar year of rows from Sales.Staging_Sales into the
        -- warehouse star schema (Dim_Customer, Dim_Item, Fact_Sales).
        --
        -- Parameters:
        --   @OrderYear  calendar year of OrderDate to load (e.g. 2021).
        --
        -- The procedure can be run repeatedly for the same year: every insert
        -- skips rows whose key is already present in the target table.
        CREATE OR ALTER PROCEDURE Sales.LoadDataFromStaging (@OrderYear INT)
        AS
        BEGIN
            -- Customer dimension. CustomerName doubles as CustomerID, so both the
            -- existence check and the grouping are done on that key: one row per
            -- CustomerID, which the NOT ENFORCED primary key cannot guarantee on
            -- its own. If a customer appears with several e-mail addresses, the
            -- alphabetically first one is kept.
            INSERT INTO Sales.Dim_Customer (CustomerID, CustomerName, EmailAddress)
            SELECT
                stg.CustomerName,
                stg.CustomerName,
                MIN(stg.EmailAddress)
            FROM [Sales].[Staging_Sales] AS stg
            WHERE YEAR(stg.OrderDate) = @OrderYear
              AND NOT EXISTS (
                    SELECT 1
                    FROM Sales.Dim_Customer AS dc
                    WHERE dc.CustomerID = stg.CustomerName
                  )
            GROUP BY stg.CustomerName;

            -- Item dimension. The item name doubles as ItemID.
            INSERT INTO Sales.Dim_Item (ItemID, ItemName)
            SELECT DISTINCT
                stg.Item,
                stg.Item
            FROM [Sales].[Staging_Sales] AS stg
            WHERE YEAR(stg.OrderDate) = @OrderYear
              AND NOT EXISTS (
                    SELECT 1
                    FROM Sales.Dim_Item AS di
                    WHERE di.ItemID = stg.Item
                  );

            -- Sales fact table. Order lines already loaded are skipped so that a
            -- second run for the same year does not double the fact rows.
            -- NOTE(review): assumes (SalesOrderNumber, SalesOrderLineNumber)
            -- identifies an order line and neither is NULL in staging -- confirm.
            INSERT INTO Sales.Fact_Sales (
                CustomerID,
                ItemID,
                SalesOrderNumber,
                SalesOrderLineNumber,
                OrderDate,
                Quantity,
                TaxAmount,
                UnitPrice
            )
            SELECT
                stg.CustomerName,
                stg.Item,
                stg.SalesOrderNumber,
                CAST(stg.SalesOrderLineNumber AS INT),
                CAST(stg.OrderDate AS DATE),
                CAST(stg.Quantity AS INT),
                CAST(stg.TaxAmount AS FLOAT),
                CAST(stg.UnitPrice AS FLOAT)
            FROM [Sales].[Staging_Sales] AS stg
            WHERE YEAR(stg.OrderDate) = @OrderYear
              AND NOT EXISTS (
                    SELECT 1
                    FROM Sales.Fact_Sales AS fs
                    WHERE fs.SalesOrderNumber = stg.SalesOrderNumber
                      AND fs.SalesOrderLineNumber = CAST(stg.SalesOrderLineNumber AS INT)
                  );
        END


### Use procedure to load data from year 2021 only

        -- Run the load for order year 2021, passing the year by parameter name.
        EXEC Sales.LoadDataFromStaging @OrderYear = 2021;


## Run analytical queries


1. the customers by total sales for the year of 2021. 

        -- Total sales (UnitPrice * Quantity) per customer for orders dated 2021,
        -- highest total first.
        SELECT
            cust.CustomerName,
            SUM(fact.UnitPrice * fact.Quantity) AS TotalSales
        FROM Sales.Dim_Customer AS cust
        INNER JOIN Sales.Fact_Sales AS fact
            ON cust.CustomerID = fact.CustomerID
        WHERE YEAR(fact.OrderDate) = 2021
        GROUP BY cust.CustomerName
        ORDER BY SUM(fact.UnitPrice * fact.Quantity) DESC;

2. the top-selling items by total sales for the year of 2021

        -- Total sales (UnitPrice * Quantity) per item for orders dated 2021,
        -- highest total first.
        SELECT
            item.ItemName,
            SUM(fact.UnitPrice * fact.Quantity) AS TotalSales
        FROM Sales.Dim_Item AS item
        INNER JOIN Sales.Fact_Sales AS fact
            ON item.ItemID = fact.ItemID
        WHERE YEAR(fact.OrderDate) = 2021
        GROUP BY item.ItemName
        ORDER BY SUM(fact.UnitPrice * fact.Quantity) DESC;


3. the top customer for each of the categories: Bike, Helmet, and Gloves, based on their total sales.

        -- Top customer by total 2021 sales in each of the categories
        -- Helmet, Bike and Gloves.
        WITH CategorizedSales AS (
            -- One row per 2021 fact row, labelled with a category derived from the
            -- item name. The WHEN branches are tested in order, so a name that
            -- contains both 'Helmet' and 'Bike' is classified as 'Helmet'.
            SELECT
                CASE
                    WHEN i.ItemName LIKE '%Helmet%' THEN 'Helmet'
                    WHEN i.ItemName LIKE '%Bike%' THEN 'Bike'
                    WHEN i.ItemName LIKE '%Gloves%' THEN 'Gloves'
                    ELSE 'Other'
                END AS Category,
                c.CustomerName,
                s.UnitPrice * s.Quantity AS Sales
            FROM Sales.Fact_Sales AS s
            INNER JOIN Sales.Dim_Customer AS c
                ON s.CustomerID = c.CustomerID
            INNER JOIN Sales.Dim_Item AS i
                ON s.ItemID = i.ItemID
            WHERE YEAR(s.OrderDate) = 2021
        ),
        RankedSales AS (
            -- Total sales per customer within each category, ranked highest first.
            -- CustomerName is a tie-breaker: without it ROW_NUMBER() would pick an
            -- arbitrary customer when two customers have the same total, and the
            -- result could change between runs.
            SELECT
                Category,
                CustomerName,
                SUM(Sales) AS TotalSales,
                ROW_NUMBER() OVER (
                    PARTITION BY Category
                    ORDER BY SUM(Sales) DESC, CustomerName ASC
                ) AS SalesRank
            FROM CategorizedSales
            WHERE Category IN ('Helmet', 'Bike', 'Gloves')
            GROUP BY Category, CustomerName
        )
        SELECT Category, CustomerName, TotalSales
        FROM RankedSales
        WHERE SalesRank = 1
        ORDER BY TotalSales DESC;


## Query a Warehouse from taxi ride dataset

1. the total number of trips and total revenue by month

 -- Number of trips and summed TotalAmount for each month name.
 SELECT
     dt.MonthName,
     COUNT(*) AS TotalTrips,
     SUM(tr.TotalAmount) AS TotalRevenue
 FROM dbo.[Date] AS dt
 INNER JOIN dbo.Trip AS tr
     ON dt.[DateID] = tr.[DateID]
 GROUP BY dt.MonthName;

2. the average trip duration and distance by day of the week

-- Average trip duration (seconds) and distance (miles) for each day name.
-- NOTE(review): if TripDurationSeconds is an integer column, AVG returns a
-- truncated integer -- confirm the column type.
SELECT
    dt.DayName,
    AVG(tr.TripDurationSeconds) AS AvgDuration,
    AVG(tr.TripDistanceMiles) AS AvgDistance
FROM dbo.[Date] AS dt
INNER JOIN dbo.Trip AS tr
    ON dt.[DateID] = tr.[DateID]
GROUP BY dt.DayName;

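3. the top 10 most popular pickup and dropoff locations (the query below ranks dropoff cities; join on the corresponding pickup geography key instead to rank pickups).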

 -- Ten cities with the most trips, counted by the trip's dropoff geography.
 SELECT TOP (10)
     geo.City,
     COUNT(*) AS TotalTrips
 FROM dbo.Geography AS geo
 INNER JOIN dbo.Trip AS tr
     ON geo.GeographyID = tr.DropoffGeographyID
 GROUP BY geo.City
 ORDER BY COUNT(*) DESC;


## check data consistency

 -- How many trips lasted longer than 24 hours (86400 seconds)?
 SELECT COUNT(*)
 FROM dbo.Trip
 WHERE TripDurationSeconds > 86400;


 -- How many trips have a negative duration?
 SELECT COUNT(*)
 FROM dbo.Trip
 WHERE TripDurationSeconds < 0;


 -- Delete the trips that have a negative duration.
 DELETE FROM dbo.Trip
 WHERE TripDurationSeconds < 0;


## Save views for reports

1. query large data

 -- Average trip duration and distance for each day name, over all trips.
 SELECT
     dt.DayName,
     AVG(tr.TripDurationSeconds) AS AvgDuration,
     AVG(tr.TripDistanceMiles) AS AvgDistance
 FROM dbo.[Date] AS dt
 INNER JOIN dbo.Trip AS tr
     ON dt.[DateID] = tr.[DateID]
 GROUP BY dt.DayName;

2. filter the data to include only records from the month of January. 

 -- Average trip duration and distance for each day name, restricted to
 -- dates whose Month value is 1 (January).
 SELECT
     dt.DayName,
     AVG(tr.TripDurationSeconds) AS AvgDuration,
     AVG(tr.TripDistanceMiles) AS AvgDistance
 FROM dbo.[Date] AS dt
 INNER JOIN dbo.Trip AS tr
     ON dt.[DateID] = tr.[DateID]
 WHERE dt.Month = 1
 GROUP BY dt.DayName;
