In [89]:
import load_csv
# Load the test dataset through the project-local load_csv module.
# Presumably returns a Spark DataFrame (later cells call .drop/.withColumn/.write on it) — TODO confirm.
df = load_csv.load("test_data/test.csv")

In [90]:
from pyspark.sql.functions import countDistinct, col, sum, row_number, when, count, year, month, quarter, round, lag, concat, lit, desc, format_number
from pyspark.sql import Window
from pathlib import Path

In [111]:
from pathlib import Path

# Output directory for the CSV exports. The path is relative, so it resolves
# against the kernel's current working directory.
base_path = Path("data_analysis_and_cleaner/data/matt")
# Create the directory (and any missing parents) up front; no error if it already exists.
base_path.mkdir(parents=True, exist_ok=True)

# Helper that persists a DataFrame as CSV underneath base_path
def save_dataframe_to_csv(df, filename):
    """Write ``df`` as CSV with a header row to ``base_path / filename``.

    The destination is handed to the DataFrame writer as a ``file://`` URI
    built from the absolute form of the target path, and the write mode is
    "overwrite", so existing output at that location is replaced.

    Note: Spark's CSV writer creates a *directory* at the target path holding
    one or more part files, not a single flat file.

    Args:
        df: DataFrame exposing the ``.write`` interface.
        filename: Name of the output entry, relative to ``base_path``.
    """
    target = (base_path / filename).absolute()
    destination = 'file://' + str(target)
    writer = df.write.mode("overwrite")
    writer.csv(destination, header=True)


In [55]:
# List the column names of the loaded dataset before any columns are removed
print(df.columns)

['order_id', 'customer_id', 'customer_name', 'product_id', 'product_name', 'product_category', 'payment_type', 'qty', 'price', 'datetime', 'country', 'city', 'ecommerce_website_name', 'payment_txn_id', 'payment_txn_success']


In [113]:
# Identifier, product and payment fields that the sales aggregations below do not read
columns_to_drop = [
    "product_name", "product_category",
    "order_id", "customer_id", "customer_name", "product_id",
    "payment_type", "payment_txn_id", "payment_txn_success",
]

# Build the trimmed DataFrame (the source `df` itself is left untouched)
df_dropped = df.drop(*columns_to_drop)

# Preview the first rows of the trimmed DataFrame
df_dropped.show()


+---+------+----------+--------------------+----------------+----------------------+
|qty| price|  datetime|             country|            city|ecommerce_website_name|
+---+------+----------+--------------------+----------------+----------------------+
|  3| 25.19|2022-05-19|       United States|       Blackhawk|       www.wayfair.com|
|  4| 59.99|2023-06-15|       United States|   Lake Barcroft|       www.walmart.com|
|  6| 17.99|2023-09-11|                Iraq|           Dahūk|        www.costco.com|
|  1| 13.19|2022-09-25|South Georgia And...|       Grytviken|        www.amazon.com|
|  9|  8.39|2024-06-15|                Mali|        Yélimané|         www.apple.com|
|  7| 35.94|2022-02-14|             Ukraine|           Odesa|       www.wayfair.com|
|  5|119.99|2023-06-08|           Argentina|            Vera|        www.amazon.com|
|  5| 114.0|2024-03-09|               China|          Guilin|         www.apple.com|
|  4| 59.99|2024-05-08|       United States|       Warrenton|    

In [114]:
# Persist the trimmed DataFrame under base_path. Spark's CSV writer creates
# "dropped_data.csv" as a directory of part files rather than a single file.
save_dataframe_to_csv(df_dropped, "dropped_data.csv")

                                                                                

In [57]:
def count_unique_values(df):
    """Count the distinct non-null values in every column of ``df``.

    All counts are computed in one aggregation, so Spark runs a single job
    regardless of how many columns the DataFrame has (instead of one
    ``collect()`` per column).

    Args:
        df: Spark DataFrame to profile.

    Returns:
        dict mapping each column name to its distinct-value count, in
        ``df.columns`` order. Empty when ``df`` has no columns.
    """
    columns = df.columns
    if not columns:
        # Nothing to aggregate; agg() requires at least one expression.
        return {}
    # One countDistinct expression per column, evaluated together in one pass.
    counts_row = df.agg(*[countDistinct(name) for name in columns]).first()
    # Pair positionally so the keys are exactly the input column names.
    return dict(zip(columns, counts_row))

# Example usage: profile the trimmed DataFrame and print the
# {column name: distinct count} mapping returned by count_unique_values
unique_counts = count_unique_values(df_dropped)
print(unique_counts)


{'product_name': 980, 'product_category': 19, 'qty': 12, 'price': 366, 'datetime': 945, 'country': 37, 'city': 100, 'ecommerce_website_name': 11}


In [58]:
# NOTE(review): rebinds `df` to the trimmed DataFrame, so every later cell that
# reads `df` sees only the retained columns; the originally loaded frame is no
# longer reachable under this name.
df = df_dropped

In [81]:
# Revenue per website for each calendar quarter (one row per site / year / quarter)
quarterly_sales_df = (
    df.withColumn("year", year(col("datetime")))
    .withColumn("quarter", quarter(col("datetime")))
    .withColumn("sales", col("qty") * col("price"))
    .groupBy("ecommerce_website_name", "year", "quarter")
    .agg(round(sum(col("sales")), 2).alias("total_sales"))
    # Label such as "2023 Q2"; becomes the pivot column header below
    .withColumn("year_quarter", concat(col("year"), lit(" Q"), col("quarter")))
)

# Reshape to one row per website with one column per year/quarter label
pivot_df = (
    quarterly_sales_df.groupBy("ecommerce_website_name")
    .pivot("year_quarter")
    .sum("total_sales")
)

# Render every numeric column with thousands separators and two decimals;
# the website name column is passed through unchanged
formatted_columns = [
    col(name) if name == 'ecommerce_website_name' else format_number(col(name), 2).alias(name)
    for name in pivot_df.columns
]

# Apply the formatting and display the full-width table
formatted_pivot_df = pivot_df.select(formatted_columns)
formatted_pivot_df.show(truncate=False)


+----------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|ecommerce_website_name|2022 Q1  |2022 Q2  |2022 Q3  |2022 Q4  |2023 Q1  |2023 Q2  |2023 Q3  |2023 Q4  |2024 Q1  |2024 Q2  |2024 Q3  |
+----------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|www.walmart.com       |19,873.65|26,417.65|32,692.85|30,020.28|25,361.03|67,240.57|40,993.97|28,433.25|29,353.72|42,140.80|14,581.66|
|www.nike.com          |57,601.74|26,094.88|49,790.46|47,400.69|38,607.34|27,308.98|81,595.00|42,850.02|18,067.20|23,389.42|5,281.60 |
|www.homedepot.com     |23,897.50|40,668.50|34,091.06|28,079.34|36,625.58|53,671.29|42,409.07|30,619.20|22,159.41|38,457.30|27,324.69|
|www.wayfair.com       |34,335.14|28,737.72|27,653.42|24,764.22|34,268.25|47,537.20|30,278.85|36,789.36|42,818.78|30,800.59|15,491.98|
|www.etsy.com          |28,868.99|59,734.72|40,969.01|4

In [100]:
# Roll the per-quarter totals in quarterly_sales_df up to one overall figure per website
total_sales_df = (
    quarterly_sales_df
    .groupBy("ecommerce_website_name")
    .agg(round(sum(col("total_sales")), 2).alias("total_sales"))
)

total_sales_df.show(truncate=False)

+----------------------+-----------+
|ecommerce_website_name|total_sales|
+----------------------+-----------+
|www.walmart.com       |357109.43  |
|www.nike.com          |417987.33  |
|www.homedepot.com     |378002.94  |
|www.wayfair.com       |353475.51  |
|www.etsy.com          |378809.01  |
|www.costco.com        |397110.68  |
|www.bestbuy.com       |375471.15  |
|www.amazon.com        |436876.78  |
|www.apple.com         |339249.97  |
|www.alibaba.com       |382412.45  |
|www.ebay.com          |362723.04  |
+----------------------+-----------+



In [105]:
# Revenue per website for each quarter of the year (all years pooled together)
quarterly_sales_df = (
    df.withColumn("quarter", quarter(col("datetime")))
    .withColumn("sales", col("qty") * col("price"))
    .groupBy("ecommerce_website_name", "quarter")
    .agg(round(sum(col("sales")), 2).alias("total_sales"))
    # Swap the quarter number for a label; the label keeps a leading space (" Q1")
    .withColumn("quarter", concat(lit(" Q"), col("quarter")))
)

# Overall revenue per website: shown as an extra column and used as the sort key
total_revenue_by_site = (
    quarterly_sales_df.groupBy("ecommerce_website_name")
    .agg(round(sum(col("total_sales")), 2).alias("total_sales"))
)

# Reshape to one row per website with one column per quarter label
pivot_df = (
    quarterly_sales_df.groupBy("ecommerce_website_name")
    .pivot("quarter")
    .sum("total_sales")
)

# Attach the overall total and rank websites from highest to lowest revenue
pivot_df = (
    pivot_df.join(total_revenue_by_site, "ecommerce_website_name")
    .orderBy(col("total_sales").desc())
)

# Render every numeric column (including total_sales) with thousands separators
# and two decimals; the website name column is passed through unchanged
formatted_columns = [
    col(name) if name == 'ecommerce_website_name' else format_number(col(name), 2).alias(name)
    for name in pivot_df.columns
]

# Apply the formatting and display the full-width table
formatted_pivot_df = pivot_df.select(formatted_columns)
formatted_pivot_df.show(truncate=False)


+----------------------+----------+----------+----------+----------+-----------+
|ecommerce_website_name| Q1       | Q2       | Q3       | Q4       |total_sales|
+----------------------+----------+----------+----------+----------+-----------+
|www.amazon.com        |88,051.30 |146,156.34|97,716.83 |104,952.31|436,876.78 |
|www.nike.com          |114,276.28|76,793.28 |136,667.06|90,250.71 |417,987.33 |
|www.costco.com        |115,659.28|96,752.08 |119,330.95|65,368.37 |397,110.68 |
|www.alibaba.com       |131,761.53|92,888.21 |76,429.18 |81,333.53 |382,412.45 |
|www.etsy.com          |87,974.44 |121,018.05|97,425.11 |72,391.41 |378,809.01 |
|www.homedepot.com     |82,682.49 |132,797.09|103,824.82|58,698.54 |378,002.94 |
|www.bestbuy.com       |105,689.85|107,282.60|85,700.22 |76,798.48 |375,471.15 |
|www.ebay.com          |107,166.56|123,936.20|65,212.54 |66,407.74 |362,723.04 |
|www.walmart.com       |74,588.40 |135,799.02|88,268.48 |58,453.53 |357,109.43 |
|www.wayfair.com       |111,

In [88]:
# Add a per-row revenue column (qty * price); this rebinds `df` with the extra column
df = df.withColumn("revenue", col("qty") * col("price"))

# Revenue for each (country, website) pair
grouped_df = (
    df.groupBy("country", "ecommerce_website_name")
    .agg(round(sum("revenue"), 2).alias("total_revenue"))
)

# Reshape to one row per country with one column per website
pivot_df = (
    grouped_df.groupBy("country")
    .pivot("ecommerce_website_name")
    .sum("total_revenue")
)

# Per-country revenue across all websites (serves as the sort key here)
total_revenue_per_country = (
    grouped_df.groupBy("country")
    .agg(sum("total_revenue").alias("country_total_revenue"))
)

# Order countries by overall revenue, largest first; the helper column is only
# needed for sorting, so it is removed again once the rows are ordered
pivot_df = (
    pivot_df.join(total_revenue_per_country, on="country")
    .orderBy(col("country_total_revenue").desc())
    .drop("country_total_revenue")
)

# Display the full-width table
pivot_df.show(truncate=False)


+----------------------------------------+---------------+--------------+-------------+---------------+--------------+------------+------------+-----------------+------------+---------------+---------------+
|country                                 |www.alibaba.com|www.amazon.com|www.apple.com|www.bestbuy.com|www.costco.com|www.ebay.com|www.etsy.com|www.homedepot.com|www.nike.com|www.walmart.com|www.wayfair.com|
+----------------------------------------+---------------+--------------+-------------+---------------+--------------+------------+------------+-----------------+------------+---------------+---------------+
|United States                           |154898.15      |243406.8      |166481.07    |194378.43      |198490.16     |172194.69   |199031.58   |201572.17        |205269.08   |138144.46      |166859.34      |
|Russia                                  |30229.67       |18960.3       |15727.06     |32633.11       |36626.24      |17400.29    |35232.08    |14988.66         |40076.

In [95]:
# Revenue per country for each quarter of the year (all years pooled together)
quarterly_sales_df = (
    df.withColumn("quarter", quarter(col("datetime")))
    .withColumn("sales", col("qty") * col("price"))
    .groupBy("country", "quarter")
    .agg(round(sum(col("sales")), 2).alias("total_sales"))
    # Swap the quarter number for a label; the label keeps a leading space (" Q1")
    .withColumn("quarter", concat(lit(" Q"), col("quarter")))
)

# Reshape to one row per country with one column per quarter label
pivot_df = (
    quarterly_sales_df.groupBy("country")
    .pivot("quarter")
    .sum("total_sales")
)

# Attach the per-country totals and rank countries from highest to lowest revenue.
# total_revenue_per_country is not defined in this cell; it must already exist in
# the session from an earlier cell.
pivot_df = (
    pivot_df.join(total_revenue_per_country, on="country")
    .orderBy(col("country_total_revenue").desc())
)

# Render every numeric column with thousands separators and two decimals;
# the country column is passed through unchanged
formatted_columns = [
    col(name) if name == 'country' else format_number(col(name), 2).alias(name)
    for name in pivot_df.columns
]

# Apply the formatting and display the full-width table
formatted_pivot_df = pivot_df.select(formatted_columns)
formatted_pivot_df.show(truncate=False)

                                                                                

+----------------------------------------+----------+----------+----------+----------+---------------------+
|country                                 | Q1       | Q2       | Q3       | Q4       |country_total_revenue|
+----------------------------------------+----------+----------+----------+----------+---------------------+
|United States                           |522,918.86|576,777.97|505,373.07|435,656.03|2,040,725.93         |
|Russia                                  |87,332.85 |109,532.20|49,724.33 |57,988.88 |304,578.26           |
|China                                   |67,819.16 |42,749.93 |31,676.60 |24,580.58 |166,826.27           |
|Iran                                    |32,664.39 |50,275.65 |9,662.61  |17,833.01 |110,435.66           |
|Japan                                   |22,318.61 |21,069.70 |27,911.08 |17,574.33 |88,873.72            |
|Ukraine                                 |20,490.51 |28,162.34 |14,866.49 |16,244.67 |79,764.01            |
|Mexico            

In [108]:
# Revenue per country for each calendar year.
# NOTE(review): the variable is still called quarterly_sales_df although the
# grouping here is by year, not quarter.
quarterly_sales_df = (
    df.withColumn("year", year(col("datetime")))
    .withColumn("sales", col("qty") * col("price"))
    .groupBy("country", "year")
    .agg(round(sum(col("sales")), 2).alias("total_sales"))
)

# Reshape to one row per country with one column per year
pivot_df = (
    quarterly_sales_df.groupBy("country")
    .pivot("year")
    .sum("total_sales")
)

# Attach the per-country totals and rank countries from highest to lowest revenue.
# total_revenue_per_country is not defined in this cell; it must already exist in
# the session from an earlier cell.
pivot_df = (
    pivot_df.join(total_revenue_per_country, on="country")
    .orderBy(col("country_total_revenue").desc())
)

# Render every numeric column with thousands separators and two decimals;
# the country column is passed through unchanged
formatted_columns = [
    col(name) if name == 'country' else format_number(col(name), 2).alias(name)
    for name in pivot_df.columns
]

# Apply the formatting and display the full-width table
formatted_pivot_df = pivot_df.select(formatted_columns)
formatted_pivot_df.show(truncate=False)

+----------------------------------------+----------+----------+----------+---------------------+
|country                                 |2022      |2023      |2024      |country_total_revenue|
+----------------------------------------+----------+----------+----------+---------------------+
|United States                           |726,915.08|852,850.05|460,960.80|2,040,725.93         |
|Russia                                  |138,281.83|117,301.74|48,994.69 |304,578.26           |
|China                                   |65,243.68 |63,639.97 |37,942.62 |166,826.27           |
|Iran                                    |48,687.30 |26,738.34 |35,010.02 |110,435.66           |
|Japan                                   |46,614.42 |29,969.98 |12,289.32 |88,873.72            |
|Ukraine                                 |28,604.17 |42,445.68 |8,714.16  |79,764.01            |
|Mexico                                  |23,811.64 |35,499.77 |20,339.58 |79,650.99            |
|Kenya              

In [110]:
from pyspark.sql.functions import col, quarter, year, sum, round, lit, concat, format_number

# Revenue per country for each calendar quarter (one row per country / year / quarter)
quarterly_sales_df = (
    df.withColumn("year", year(col("datetime")))
    .withColumn("quarter", quarter(col("datetime")))
    .withColumn("sales", col("qty") * col("price"))
    .groupBy("country", "year", "quarter")
    .agg(round(sum(col("sales")), 2).alias("total_sales"))
    # Label such as "2023 Q2"; becomes the pivot column header below
    .withColumn("year_quarter", concat(col("year"), lit(" Q"), col("quarter")))
)

# Overall revenue per country across every year and quarter
total_sales_df = (
    quarterly_sales_df.groupBy("country")
    .agg(round(sum(col("total_sales")), 2).alias("country_total_revenue"))
)

# Quarterly rows with the country total attached. The pivot below no longer reads
# this frame; the name is kept only so other code referring to it keeps working
# (the join is lazy, so defining it costs nothing unless it is actually used).
quarterly_sales_with_total_df = quarterly_sales_df.join(total_sales_df, on="country")

# Pivot straight from the quarterly figures: country_total_revenue plays no part
# in the pivot, so joining it in beforehand only added a redundant shuffle.
pivot_df = (
    quarterly_sales_df.groupBy("country")
    .pivot("year_quarter")
    .sum("total_sales")
)

# Attach the country total exactly once and rank countries from highest to lowest revenue
pivot_df = (
    pivot_df.join(total_sales_df, on="country")
    .orderBy(col("country_total_revenue").desc())
)

# Render every numeric column with thousands separators and two decimals;
# the country column is passed through unchanged
formatted_columns = [
    col(name) if name == 'country' else format_number(col(name), 2).alias(name)
    for name in pivot_df.columns
]

# Apply the formatting and display the full-width table
formatted_pivot_df = pivot_df.select(formatted_columns)
formatted_pivot_df.show(truncate=False)


[Stage 978:>                                                        (0 + 1) / 1]

+----------------------------------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------------------+
|country                                 |2022 Q1   |2022 Q2   |2022 Q3   |2022 Q4   |2023 Q1   |2023 Q2   |2023 Q3   |2023 Q4   |2024 Q1   |2024 Q2   |2024 Q3  |country_total_revenue|
+----------------------------------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------------------+
|United States                           |142,602.60|194,614.77|166,370.03|223,327.68|184,481.98|193,579.26|262,460.46|212,328.35|195,834.28|188,583.94|76,542.58|2,040,725.93         |
|Russia                                  |36,935.89 |44,649.44 |17,704.15 |38,992.35 |23,470.39 |46,986.49 |27,848.33 |18,996.53 |26,926.57 |17,896.27 |4,171.85 |304,578.26           |
|China                                   |24,944.98 |12,299.11 |13,495.18 |

                                                                                