In [1]:
from pyspark.sql import SparkSession

# Reuse the already-running session if there is one; otherwise start a new
# session registered under the application name "clean".
spark = (
    SparkSession.builder
    .appName("clean")
    .getOrCreate()
)

In [2]:
# Bare expression as the last line of the cell: the notebook renders the
# session object's repr as the cell output (a quick check that it started).
spark

In [4]:
# Load the Superstore export. The first row supplies the column names and
# Spark is asked to infer each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("superstore.csv")
)
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row.ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (null

In [7]:
def is_english(colName):
    """Return True when every character of ``colName`` is plain ASCII.

    Despite the name, this checks the character set, not the language: any
    character with a code point of 128 or above makes the result False.
    An empty string is considered ASCII (returns True).
    """
    return colName.isascii()

# Columns whose names need cleaning: anything containing a "." or at least
# one non-ASCII character. Original column order is preserved.
colNameClean = [
    name
    for name in df.columns
    if "." in name or not is_english(name)
]
print(colNameClean)

['Customer.ID', 'Customer.Name', '记录数', 'Order.Date', 'Order.ID', 'Order.Priority', 'Product.ID', 'Product.Name', 'Row.ID', 'Ship.Date', 'Ship.Mode', 'Shipping.Cost', 'Sub.Category']


In [8]:
# Swap every "." in a column name for "_". Names without a dot come through
# unchanged, so the whole header can be rewritten in a single toDF call.
df = df.toDF(*[name.replace(".", "_") for name in df.columns])

df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order_Date: timestamp (nullable = true)
 |-- Order_ID: string (nullable = true)
 |-- Order_Priority: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row_ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship_Date: string (nullable = true)
 |-- Ship_Mode: string (nullable = true)
 |-- Shipping_Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub_Category: string (nullable = true)
 |-- Year: string (null

In [9]:
# Any column whose name contains a non-ASCII character (per is_english) is
# dropped outright; all other columns are kept as they are.
non_english_columns = list(filter(lambda name: not is_english(name), df.columns))
df_cleaned = df.drop(*non_english_columns)

df_cleaned.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- Order_Date: timestamp (nullable = true)
 |-- Order_ID: string (nullable = true)
 |-- Order_Priority: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row_ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship_Date: string (nullable = true)
 |-- Ship_Mode: string (nullable = true)
 |-- Shipping_Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub_Category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Market2: string (n

In [10]:
# Preview the first five rows of the cleaned frame.
df_cleaned.show(n=5)

+---------------+-----------+-------------+-----------+----------------+--------+------+-------------------+--------------+--------------+---------------+--------------------+-------+--------+------+------+-----+--------+--------------------+--------------+-------------+----------+------------+----+-------------+-------+
|       Category|       City|      Country|Customer_ID|   Customer_Name|Discount|Market|         Order_Date|      Order_ID|Order_Priority|     Product_ID|        Product_Name| Profit|Quantity|Region|Row_ID|Sales| Segment|           Ship_Date|     Ship_Mode|Shipping_Cost|     State|Sub_Category|Year|      Market2|weeknum|
+---------------+-----------+-------------+-----------+----------------+--------+------+-------------------+--------------+--------------+---------------+--------------------+-------+--------+------+------+-----+--------+--------------------+--------------+-------------+----------+------------+----+-------------+-------+
|Office Supplies|Los Angeles|Un

In [12]:
import os
from datetime import datetime
# The export folder is named after today's date, e.g. ./2025-05-13
today = f"{datetime.now():%Y-%m-%d}"
output_folder = "./" + today
os.makedirs(output_folder, exist_ok=True)

# Partitioned export: rows are split by their Country value, with a header
# row in every CSV file. Whatever is already at output_folder is replaced.
(
    df_cleaned.write
    .mode("overwrite")
    .option("header", True)
    .partitionBy("Country")
    .csv(output_folder)
)

In [13]:
today = datetime.now().strftime("%Y-%m-%d")  # e.g. 2025-05-13
output_folder = f"./{today}"
os.makedirs(output_folder, exist_ok=True)

# NOTE(review): this is the same dated folder the partitioned export
# (partitionBy("Country")) writes to when both cells run on the same day.
# Re-running that cell with mode="overwrite" will presumably wipe these
# per-country outputs -- confirm whether a separate folder is wanted.

# Write one output per country. Spark creates a *directory* named
# "<Country>.csv"; coalesce(1) collapses the data to a single partition so
# that directory holds one part file.
countries = [row["Country"] for row in df_cleaned.select("Country").distinct().collect()]
for country in countries:
    # Country is nullable. A null arrives here as None, which has no
    # .replace() and would match no rows under `==`, so use a placeholder
    # label and a null-safe comparison to export those rows too.
    label = "unknown_country" if country is None else country.replace(' ', '_')
    df_country = df_cleaned.filter(df_cleaned.Country.eqNullSafe(country)).coalesce(1)
    output_path = f"{output_folder}/{label}.csv"
    df_country.write.csv(output_path, header=True, mode="overwrite")