In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Bucketing").getOrCreate()

spark

In [35]:
dftrue = spark.read.csv("superstore.csv", header = True, inferSchema = True)
dftrue.show(5)
dftrue.printSchema()

+---------------+-----------+-------------+-----------+----------------+--------+------+------+-------------------+--------------+--------------+---------------+--------------------+-------+--------+------+------+-----+--------+--------------------+--------------+-------------+----------+------------+----+-------------+-------+
|       Category|       City|      Country|Customer.ID|   Customer.Name|Discount|Market|记录数|         Order.Date|      Order.ID|Order.Priority|     Product.ID|        Product.Name| Profit|Quantity|Region|Row.ID|Sales| Segment|           Ship.Date|     Ship.Mode|Shipping.Cost|     State|Sub.Category|Year|      Market2|weeknum|
+---------------+-----------+-------------+-----------+----------------+--------+------+------+-------------------+--------------+--------------+---------------+--------------------+-------+--------+------+------+-----+--------+--------------------+--------------+-------------+----------+------------+----+-------------+-------+
|Office Suppl

In [4]:
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row.ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (null

In [12]:
from pyspark.sql.functions import col, regexp_replace
# id_columns = [col_name for col_name in df.columns if col_name.endswith("ID")]

# for old_name in id_columns:
#     new_name = old_name.replace(".", "_")
#     df = df.withColumn(new_name, col(old_name).cast("integer"))
#     df = df.drop(old_name)

# df.printSchema()
# df.show(5, truncate=False)

id_columns = [col_name for col_name in df.columns if col_name.endswith("ID")]

# Step 3: Rename columns (replace "." with "_") and cast to integer
for old_name in id_columns:
    new_name = old_name.replace(".", "_")  # Replace "." with "_"
    
    # Use backticks to escape column names with dots
    df = df.withColumn(new_name, regexp_replace(col(f"`{old_name}`"), "[^0-9]", "").cast("integer"))
    # Alternative: If IDs are already numeric strings (e.g., "123"), use:
    # df = df.withColumn(new_name, col(f"`{old_name}`").cast("integer"))
    
    df = df.drop(f"`{old_name}`")  # Drop the original column with escaped name

# Step 4: Verify the new schema and data
df.printSchema()
# df.show(5, truncate=False)

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row.ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (null

In [13]:
id_columns = [col_name for col_name in df.columns if col_name.endswith("ID") and "." in col_name]

# Step 3: Rename columns (replace "." with "_"), cast to integer, and drop original columns
for old_name in id_columns:
    # new_name = old_name.replace(".", "_")  # Replace "." with "_"
    
    # # Create new column with cleaned and casted values
    # df = df.withColumn(new_name, regexp_replace(col(f"`{old_name}`"), "[^0-9]", "").cast("integer"))
    
    # Drop the original column
    df = df.drop(f"`{old_name}`")

df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row.ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (null

In [16]:
df.select("Customer_ID","Order_ID","Product_ID", "Row_ID").distinct().show(5)

+-----------+----------+----------+------+
|Customer_ID|  Order_ID|Product_ID|Row_ID|
+-----------+----------+----------+------+
|     100754|2013144015|  10002870| 39985|
|     127904|2012149972|  10002763| 34083|
|     111254|2013123050|  10003455| 39525|
|     100904|2011156160|  10003876| 40954|
|     106154|2014157987|  10004582| 33251|
+-----------+----------+----------+------+
only showing top 5 rows



In [17]:
df.drop("Order.ID")

DataFrame[Category: string, City: string, Country: string, Customer.ID: string, Customer.Name: string, Discount: double, Market: string, 记录数: int, Order.Date: timestamp, Order.Priority: string, Product.ID: string, Product.Name: string, Profit: string, Quantity: string, Region: string, Row.ID: string, Sales: string, Segment: string, Ship.Date: string, Ship.Mode: string, Shipping.Cost: string, State: string, Sub.Category: string, Year: string, Market2: string, weeknum: string, Customer_ID: int, Order_ID: int, Product_ID: int, Row_ID: int]

In [18]:
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row.ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (null

In [19]:
df1 = df.drop("Row.ID")
df1.show()

+---------------+-----------+-------------+-----------+------------------+--------+------+------+-------------------+--------------+--------------+---------------+--------------------+-------+--------+------+-----+--------+--------------------+--------------+-------------+----------+------------+----+-------------+-------+-----------+----------+----------+------+
|       Category|       City|      Country|Customer.ID|     Customer.Name|Discount|Market|记录数|         Order.Date|      Order.ID|Order.Priority|     Product.ID|        Product.Name| Profit|Quantity|Region|Sales| Segment|           Ship.Date|     Ship.Mode|Shipping.Cost|     State|Sub.Category|Year|      Market2|weeknum|Customer_ID|  Order_ID|Product_ID|Row_ID|
+---------------+-----------+-------------+-----------+------------------+--------+------+------+-------------------+--------------+--------------+---------------+--------------------+-------+--------+------+-----+--------+--------------------+--------------+------------

df1.printSchema()

In [20]:
df1.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Market2: string (nul

In [28]:
id_columns = [col_name for col_name in df.columns if col_name.endswith("ID") and "." in col_name]

# Step 3: Rename columns (replace "." with "_"), cast to integer, and drop original columns
for old_name in id_columns:
    # new_name = old_name.replace(".", "_")  # Replace "." with "_"
    
    # # Create new column with cleaned and casted values
    # df = df.withColumn(new_name, regexp_replace(col(f"`{old_name}`"), "[^0-9]", "").cast("integer"))
    
    # Drop the original column
    
    df3 = df2.drop(f"`{old_name}`")

df3.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Market2: string (nul

In [30]:
# Debug - Print initial columns
print("Initial columns:", df.columns)

# Dynamically detect columns ending with "ID" and containing a dot
id_columns = [col_name for col_name in df.columns if col_name.endswith("ID") and "." in col_name]
print("Detected ID columns with dots:", id_columns)

# Rename columns (replace "." with "_"), cast to integer, and drop original columns
for old_name in id_columns:
    new_name = old_name.replace(".", "_")
    print(f"Processing: {old_name} -> {new_name}")
    df= df.withColumn(new_name, regexp_replace(col(f"`{old_name}`"), "[^0-9]", "").cast("integer"))
    df4= df.drop(f"`{old_name}`")
    print(f"Dropped: {old_name}")

# Debug - Print final columns
print("Final columns:", df.columns)

# Verify the new schema and data
df4.printSchema()
df4.show(5, truncate=False)

Initial columns: ['Category', 'City', 'Country', 'Customer.ID', 'Customer.Name', 'Discount', 'Market', '记录数', 'Order.Date', 'Order.ID', 'Order.Priority', 'Product.ID', 'Product.Name', 'Profit', 'Quantity', 'Region', 'Row.ID', 'Sales', 'Segment', 'Ship.Date', 'Ship.Mode', 'Shipping.Cost', 'State', 'Sub.Category', 'Year', 'Market2', 'weeknum', 'Customer_ID', 'Order_ID', 'Product_ID', 'Row_ID']
Detected ID columns with dots: ['Customer.ID', 'Order.ID', 'Product.ID', 'Row.ID']
Processing: Customer.ID -> Customer_ID
Dropped: Customer.ID
Processing: Order.ID -> Order_ID
Dropped: Order.ID
Processing: Product.ID -> Product_ID
Dropped: Product.ID
Processing: Row.ID -> Row_ID
Dropped: Row.ID
Final columns: ['Category', 'City', 'Country', 'Customer.ID', 'Customer.Name', 'Discount', 'Market', '记录数', 'Order.Date', 'Order.ID', 'Order.Priority', 'Product.ID', 'Product.Name', 'Profit', 'Quantity', 'Region', 'Row.ID', 'Sales', 'Segment', 'Ship.Date', 'Ship.Mode', 'Shipping.Cost', 'State', 'Sub.Category

In [31]:
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row.ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (null

In [45]:
id_columns = [col_name for col_name in dfff12.columns if col_name.endswith("ID") and "." in col_name]
print(type(id_columns))
print(id_columns)

<class 'list'>
['Customer.ID', 'Order.ID']


In [46]:
for new_col in id_columns:
    dfff12 = dftrue.drop(f"{new_col}")
    print(f"{new_col}")

dfff12.printSchema()
dftrue.printSchema()

Customer.ID
Order.ID
root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Market2: string (nullable = true)
 |-- weeknum: string (nullable = true)

root
 |

In [48]:
dfff1 = dfff1.drop(*id_columns)

dfff1.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Market2: string (nullable = true)
 |-- weeknum: string (nullable = true)



In [49]:
dfff1 = dfff1.withColumnRenamed("Product.ID", "Product_ID")
dfff1.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Market2: string (nullable = true)
 |-- weeknum: string (nullable = true)



In [50]:
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row.ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (null

In [53]:
dot_column = [new1column for new1column in df.columns if new1column.endswith("ID") and "." in new1column]
print(dot_column)

# id_columns = [col_name for col_name in dfff12.columns if col_name.endswith("ID") and "." in col_name]


['Customer.ID', 'Order.ID', 'Product.ID', 'Row.ID']


In [54]:
for new1 in df.columns:
    print(new1)

Category
City
Country
Customer.ID
Customer.Name
Discount
Market
记录数
Order.Date
Order.ID
Order.Priority
Product.ID
Product.Name
Profit
Quantity
Region
Row.ID
Sales
Segment
Ship.Date
Ship.Mode
Shipping.Cost
State
Sub.Category
Year
Market2
weeknum
Customer_ID
Order_ID
Product_ID
Row_ID


In [57]:
for new2 in df.columns:
    df = new2.replace(".", "_")

In [58]:
df.printSchema()

AttributeError: 'str' object has no attribute 'printSchema'

In [59]:
print(df)

Row_ID


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("Optimization").getOrCreate()

df = spark.read.csv("SuperStoreOrders.csv", header = True, inferSchema=True)

filtered_df = df.filter(df.year == 2014)
filtered_df = filtered_df.filter(col("sales") > 500)

# Create a small lookup table (region to manager)
lookup_data = [
    ("North", "Alice"),
    ("South", "Bob"),
    ("Central", "Charlie"),
    ("West", "David"),
    ("Africa", "Eve"),
    ("Oceania", "Frank"),
    ("Southeast Asia", "Grace"),
    ("North Asia", "Hannah"),
    ("Central Asia", "Ian"),
    ("EMEA", "Jack")
]
lookup_df = spark.createDataFrame(lookup_data, ["region", "manager"])

# Join with lookup table
joined_df = filtered_df.join(lookup_df, "region")

# Group by region and category, calculate total sales and profit
result_df = joined_df.groupBy("region", "category").agg(
    {"sales": "sum", "profit": "sum"}
)

In [3]:
spark.stop()

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, broadcast
from pyspark.storagelevel import StorageLevel

# Initialize Spark session with tuned configurations
spark = SparkSession.builder.appName("OptimizedSuperStore").getOrCreate()
    # .config("spark.sql.shuffle.partitions", "50") \
    # .config("spark.executor.memory", "4g") \
    # .config("spark.executor.cores", "2") \
  

# Load dataset
df = spark.read.csv("SuperStoreOrders.csv", header=True, inferSchema=True)

# Combine filters and cache filtered DataFrame
filtered_df = df.filter((col("year") == 2014) & (col("sales") > 500)).cache()

# Repartition to balance data before join
filtered_df = filtered_df.repartition(50, "region")

# Create a small lookup table (region to manager)
lookup_data = [
    ("North", "Alice"),
    ("South", "Bob"),
    ("Central", "Charlie"),
    ("West", "David"),
    ("Africa", "Eve"),
    ("Oceania", "Frank"),
    ("Southeast Asia", "Grace"),
    ("North Asia", "Hannah"),
    ("Central Asia", "Ian"),
    ("EMEA", "Jack")
]
lookup_df = spark.createDataFrame(lookup_data, ["region", "manager"])

# Broadcast join with lookup table
joined_df = filtered_df.join(broadcast(lookup_df), "region")

# Group by region and category with bucketing to avoid shuffle
joined_df.write.bucketBy(50, "region").saveAsTable("bucketed_sales")
bucketed_df = spark.table("bucketed_sales")
result_df = bucketed_df.groupBy("region", "category").agg(
    {"sales": "sum", "profit": "sum"}
)

# Write results to CSV
# result_df.coalesce(1).write.csv("output_optimized.csv")

# Unpersist cached data
filtered_df.unpersist()

DataFrame[order_id: string, order_date: string, ship_date: string, ship_mode: string, customer_name: string, segment: string, state: string, country: string, market: string, region: string, product_id: string, category: string, sub_category: string, product_name: string, sales: string, quantity: string, discount: string, profit: double, shipping_cost: double, order_priority: string, year: string]

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dfPractice").getOrCreate()

spark

In [2]:
df = spark.read.csv("superstore.csv", inferSchema=True, header=True)
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer.ID: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.ID: string (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.ID: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Row.ID: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (null

In [3]:
id_columns=[colName for colName in df.columns if colName.endswith("ID") and "." in colName]
print(id_columns, type(id_columns))


['Customer.ID', 'Order.ID', 'Product.ID', 'Row.ID'] <class 'list'>


In [12]:
from pyspark.sql.functions import col
# for old_name in id_columns:
#     # new_name = old_name.replace(".", "_")  # Replace "." with "_"
    
#     # # Create new column with cleaned and casted values
#     # df = df.withColumn(new_name, regexp_replace(col(f"`{old_name}`"), "[^0-9]", "").cast("integer"))
    
#     # Drop the original column
    
#     df3 = df2.drop(f"`{old_name}`")

for oldColumn in id_columns:
    new_name = oldColumn.replace(".", "_")
    print("NEW", new_name, oldColumn)
    df = df.withColumn(new_name, col(f"`{oldColumn}`").cast("integer"))
    print("NEWWW", new_name, oldColumn)
    df = df.drop(oldColumn)

NEW Customer_ID Customer.ID
NEWWW Customer_ID Customer.ID
NEW Order_ID Order.ID
NEWWW Order_ID Order.ID
NEW Product_ID Product.ID
NEWWW Product_ID Product.ID
NEW Row_ID Row.ID
NEWWW Row_ID Row.ID


In [13]:
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Customer.Name: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Market: string (nullable = true)
 |-- 记录数: integer (nullable = true)
 |-- Order.Date: timestamp (nullable = true)
 |-- Order.Priority: string (nullable = true)
 |-- Product.Name: string (nullable = true)
 |-- Profit: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Ship.Date: string (nullable = true)
 |-- Ship.Mode: string (nullable = true)
 |-- Shipping.Cost: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Sub.Category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Market2: string (nullable = true)
 |-- weeknum: string (nullable = true)
 |-- Customer_ID: integer (nullable = true)
 |-- Order_ID: integer (nullable = true)
 |-- Product_ID: integer (

In [8]:
a=df["Country"]
df= df.drop(a)
print(df)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Country` cannot be resolved. Did you mean one of the following? [`Category`, `City`, `Customer`.`ID`, `Customer`.`Name`, `Discount`, `Market`, `记录数`, `Order`.`Date`, `Order`.`ID`, `Order`.`Priority`, `Product`.`ID`, `Product`.`Name`, `Profit`, `Quantity`, `Region`, `Row`.`ID`, `Sales`, `Segment`, `Ship`.`Date`, `Ship`.`Mode`, `Shipping`.`Cost`, `State`, `Sub`.`Category`, `Year`, `Market2`, `weeknum`].