In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

scala_version = '2.12'
spark_version = '3.5.1'
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
    'org.apache.kafka:kafka-clients:3.7.0'
]

spark = SparkSession.builder.master("local").appName("Spark-Cleaning").config("spark.jars.packages", ",".join(packages)).getOrCreate()

In [2]:
spark

In [4]:
file_path = "../Crawl/amazon_data.csv"  # Update with the correct path
spark_df = spark.read.option("header", "true").csv(file_path)
spark_df.show()

+--------------------+-------+---------+--------------------+------------------+-------+--------------------+---------+-----------------------+------------------+--------------------+-----------------------+--------------------+------------------------+-------------------+---------------------+---------------------+--------------------+-----------------+-----+----------+------------------+----+----------+--------------+
|               title|  price|old_price|         product_url|            rating|reviews|           purchases|    Brand|Memory Storage Capacity|Hardware Interface|     Special Feature|Connectivity Technology|         Write Speed|Digital Storage Capacity|Hard Disk Interface|Hard Disk Form Factor|Hard Disk Description|  Compatible Devices|Installation Type|Color|Model Name|Product Dimensions|Size|Read Speed|Hard Disk Size|
+--------------------+-------+---------+--------------------+------------------+-------+--------------------+---------+-----------------------+---------

In [5]:
from pyspark.sql import functions as F

# `price` và `old_price` 
# Xóa dấu $ và đổi sang float cho tính toán, trong trường hợp old price không có thì sẽ điền = price
spark_df = spark_df.withColumn("price", F.regexp_replace("price", r'[\$,]', '').cast("float")) \
                   .withColumn("old_price", F.when(F.col("old_price").isNull(), F.col("price"))
                               .otherwise(F.regexp_replace("old_price", r'[\$,]', '').cast("float")))

# `rating`
# Thay đổi dạng rating
spark_df = spark_df.withColumn("rating", 
                               F.when(F.col("rating").isNotNull(), 
                                      F.regexp_extract("rating", r"(\d\.\d)", 1).cast("float"))
                               .otherwise(F.lit(0.0)))

In [6]:
spark_df.select('title', 'price', 'old_price', 'rating').show()

+--------------------+------+---------+-----------------+
|               title| price|old_price|           rating|
+--------------------+------+---------+-----------------+
|MFi Certified 256...| 49.99|    49.99|              0.0|
|THKAILAR USB C Fl...| 37.99|    37.99|4.300000190734863|
|SCICNCE 128GB Pho...| 27.99|    27.99|              0.0|
|Seagate Portable ...| 69.99|    69.99|4.699999809265137|
|SanDisk 2TB Extre...|159.99|   199.99|4.599999904632568|
|SAMSUNG 990 PRO S...|169.99|   249.99|              0.0|
|Seagate Storage E...|236.92|   249.99|4.599999904632568|
|Toshiba Canvio Ba...| 66.49|    69.99|4.599999904632568|
|Crucial P3 Plus 2...|113.95|   169.99|              0.0|
|WD_BLACK 2TB SN85...|149.99|   189.99|              0.0|
|SAMSUNG T7 Shield...|309.99|   499.99|4.699999809265137|
|SanDisk 128GB Ult...| 12.49|    19.99|4.699999809265137|
|Kingston Ironkey ...|134.99|   164.99|4.300000190734863|
|USB Flash Drive 5...| 42.99|    42.99|              0.0|
|10 Pack 64GB 

In [7]:
# `reviews`
# Đổi số lượng review thânhf số
spark_df = spark_df.withColumn("reviews", 
                               F.when(F.col("reviews").isNotNull(), 
                                      F.regexp_replace(F.col("reviews"), ",", "").cast("int"))
                               .otherwise(F.lit(0)))

# `purchases` 
# Lấy thông tin cần thiết từ chuỗi và biến đổi thành integer
spark_df = spark_df.withColumn("purchases",
                               F.when(F.col("purchases").rlike(r"(\d+K?)\+ bought in past month"),
                                      F.regexp_replace(F.regexp_extract(F.col("purchases"), r"(\d+K?)", 1), "K", "000")
                                      .cast("int"))
                               .otherwise(F.lit(0)))

In [9]:
spark_df.select('reviews', 'purchases').show()

+-------+---------+
|reviews|purchases|
+-------+---------+
|     13|        0|
|   4564|      200|
|    214|       50|
| 252699|    10000|
|  69135|    10000|
|   9681|    10000|
|  23949|     5000|
|  10602|     8000|
|  16995|     6000|
|  22032|     6000|
|  13461|     6000|
|  40986|    10000|
|    142|        0|
|    590|      100|
|   4239|        0|
|   5011|     5000|
|  33190|     6000|
|  40903|     5000|
|  18651|     9000|
| 120591|     7000|
+-------+---------+
only showing top 20 rows



In [10]:
# Các Brand chấp nhận trong trường hợp phải lấy từ Title
accepted_brands = ["SAMSUNG", "Seagate", "SanDisk", "Western Digital", "Crucial", "WD", 
                   "Kingston", "Amazon Basics", "WD_BLACK", "TOSHIBA"]

# Create `Brand` column: if the value is null, check `title` for accepted brands
spark_df = spark_df.withColumn("Brand",
                               F.when(F.col("Brand").isNull(),
                                       F.coalesce(*[F.when(F.col("title").contains(brand), brand) for brand in accepted_brands], F.lit("others")))
                               .otherwise(F.col("Brand"))
                              )

In [11]:
spark_df.select('Brand').show()

+---------------+
|          Brand|
+---------------+
|         others|
|         SUDEHO|
|       THKAILAR|
|        SCICNCE|
|        SanDisk|
|        SAMSUNG|
|        Seagate|
|         others|
|        Crucial|
|       WD_BLACK|
|        SAMSUNG|
|        SanDisk|
|       Kingston|
|        GNASEET|
|      FEBNISCTE|
|             WD|
|        SAMSUNG|
|         others|
|Western Digital|
|        Crucial|
+---------------+
only showing top 20 rows



In [12]:
spark_df.select('Memory Storage Capacity', 'Digital Storage Capacity').show()

+-----------------------+------------------------+
|Memory Storage Capacity|Digital Storage Capacity|
+-----------------------+------------------------+
|                   NULL|                    NULL|
|                 256 GB|                    NULL|
|                 512 GB|                    NULL|
|                 128 GB|                    NULL|
|                   NULL|                    NULL|
|                   NULL|                    NULL|
|                   NULL|                    NULL|
|                   NULL|                    NULL|
|                   NULL|                    NULL|
|                   NULL|                    NULL|
|                   NULL|                 2000 GB|
|                   NULL|                    NULL|
|                  64 GB|                    NULL|
|                 512 GB|                    NULL|
|                  64 GB|                    NULL|
|                   NULL|                    NULL|
|                   NULL|      

In [13]:
# Define patterns and conditions for Memory Storage Capacity and Digital Storage Capacity
usb_pattern = r"(?i)\bUSB\b"
ssd_patterns = r"(?i)\bSSD\b|\bSolid State Drive\b|\bPortable External Hard Drive\b|\bHDD\b"

# Extract storage capacity from `title` if the column is null and the title contains relevant keywords.
# This will fill `Memory Storage Capacity` for USB products and `Digital Storage Capacity` for SSD and other drives.
spark_df = (
    spark_df
    # Process `Memory Storage Capacity`
    .withColumn(
        "Memory Storage Capacity",
        F.when(
            F.col("Memory Storage Capacity").isNull() & F.col("title").rlike(usb_pattern),
            F.regexp_extract(F.col("title"), r"(\d+(?:\.\d+)?\s?(TB|GB|MB))", 0)
        ).otherwise(F.col("Memory Storage Capacity"))
    )
    # Process `Digital Storage Capacity`
    .withColumn(
        "Digital Storage Capacity",
        F.when(
            F.col("Digital Storage Capacity").isNull() & F.col("title").rlike(ssd_patterns),
            F.regexp_extract(F.col("title"), r"(\d+(?:\.\d+)?\s?(TB|GB|MB))", 0)
        ).otherwise(F.col("Digital Storage Capacity"))
    )
)

In [14]:
spark_df.select('Memory Storage Capacity', 'Digital Storage Capacity').show()

+-----------------------+------------------------+
|Memory Storage Capacity|Digital Storage Capacity|
+-----------------------+------------------------+
|                  256GB|                    NULL|
|                 256 GB|                    NULL|
|                 512 GB|                    NULL|
|                 128 GB|                     2TB|
|                    2TB|                     2TB|
|                   NULL|                  450 MB|
|                   NULL|                     2TB|
|                    2TB|                     2TB|
|                   NULL|                     2TB|
|                   NULL|                     2TB|
|                   NULL|                 2000 GB|
|                  128GB|                    NULL|
|                  64 GB|                    NULL|
|                 512 GB|                    NULL|
|                  64 GB|                    NULL|
|                   NULL|                     2TB|
|                    1TB|      

In [15]:
spark_df = spark_df.withColumn(
    "Memory Storage Capacity",
    F.regexp_replace(F.col("Memory Storage Capacity"), "\\s+", "")
).withColumn(
    "Digital Storage Capacity",
    F.regexp_replace(F.col("Digital Storage Capacity"), "\\s+", "")
)

# Fill nulls in both columns with '0'
spark_df = spark_df.fillna("0", subset=["Memory Storage Capacity", "Digital Storage Capacity"])

In [16]:
from pyspark.sql.functions import col, regexp_extract, when
unit_multiplier = {
    "TB": 1024,
    "GB": 1,
    "MB": 1/1024
}

# Thêm cột mới cho phần giá trị
spark_df = spark_df.withColumn("numeric_value", regexp_extract(col("Digital Storage Capacity"), r"(\d+\.?\d*)", 1).cast("float"))

# Thêm cột mới cho phần 
spark_df = spark_df.withColumn("unit", regexp_extract(col("Digital Storage Capacity"), r"([A-Za-z]+)", 1))

# Chuẩn hóa sang dạng GB
spark_df = spark_df.withColumn(
    "Digital Storage Capacity (GB)",
     when(col("numeric_value") == 0, 0)
    .when(col("unit") == "TB", col("numeric_value") * unit_multiplier["TB"])
    .when(col("unit") == "GB", col("numeric_value") * unit_multiplier["GB"])
    .when(col("unit") == "MB", col("numeric_value") * unit_multiplier["MB"])
    .otherwise(None) 
)

spark_df.select('Digital Storage Capacity', 'Digital Storage Capacity (GB)').show()

+------------------------+-----------------------------+
|Digital Storage Capacity|Digital Storage Capacity (GB)|
+------------------------+-----------------------------+
|                       0|                          0.0|
|                       0|                          0.0|
|                       0|                          0.0|
|                     2TB|                       2048.0|
|                     2TB|                       2048.0|
|                   450MB|                  0.439453125|
|                     2TB|                       2048.0|
|                     2TB|                       2048.0|
|                     2TB|                       2048.0|
|                     2TB|                       2048.0|
|                  2000GB|                       2000.0|
|                       0|                          0.0|
|                       0|                          0.0|
|                       0|                          0.0|
|                       0|     

In [17]:
# Tương tụ cho Memory Storage Capacity
spark_df = spark_df.withColumn("numeric_value_mem", regexp_extract(col("Memory Storage Capacity"), r"(\d+\.?\d*)", 1).cast("float"))
spark_df = spark_df.withColumn("unit_mem", regexp_extract(col("Memory Storage Capacity"), r"([A-Za-z]+)", 1))
spark_df = spark_df.withColumn(
    "Memory Storage Capacity (GB)",
    when(col("numeric_value_mem") == 0, 0)  # Keep zero values unchanged
    .when(col("unit_mem") == "TB", col("numeric_value_mem") * unit_multiplier["TB"])
    .when(col("unit_mem") == "GB", col("numeric_value_mem") * unit_multiplier["GB"])
    .when(col("unit_mem") == "MB", col("numeric_value_mem") * unit_multiplier["MB"])
    .otherwise(None)
)

# Bỏ các cột không cần thiết
spark_df = spark_df.drop("numeric_value", "unit", "numeric_value_mem", "unit_mem", "Digital Storage Capacity", "Memory Storage Capacity")

In [18]:
spark_df.select('Memory Storage Capacity (GB)', 'Digital Storage Capacity (GB)').show()

+----------------------------+-----------------------------+
|Memory Storage Capacity (GB)|Digital Storage Capacity (GB)|
+----------------------------+-----------------------------+
|                       256.0|                          0.0|
|                       256.0|                          0.0|
|                       512.0|                          0.0|
|                       128.0|                       2048.0|
|                      2048.0|                       2048.0|
|                         0.0|                  0.439453125|
|                         0.0|                       2048.0|
|                      2048.0|                       2048.0|
|                         0.0|                       2048.0|
|                         0.0|                       2048.0|
|                         0.0|                       2000.0|
|                       128.0|                          0.0|
|                        64.0|                          0.0|
|                       

In [19]:
# 'Hardware Interface
hardware_interfaces = ["USB 3.0", "USB", "USB 3.2 Gen 1", "USB 2.0", "USB 3.2 Gen 2"]

spark_df = spark_df.withColumn("temp_title", F.regexp_replace(F.col("title"), "[,]", ""))

hardware_interface_expr = F.coalesce(
    *[F.when(F.col("temp_title").contains(interface), interface) for interface in hardware_interfaces],
    F.lit("USB")  
)

spark_df = spark_df.withColumn(
    "Hardware Interface",
    F.when(F.col("Hardware Interface").isNull(), hardware_interface_expr)
    .otherwise(F.col("Hardware Interface"))
)

# Re-check for any misaligned values, re-run extraction or set to default if needed
spark_df = spark_df.withColumn(
    "Hardware Interface",
    F.when(
        # If Hardware Interface is still null or doesn’t match any known hardware interface
        (F.col("Hardware Interface").isNull()) |
        (~F.col("Hardware Interface").isin(hardware_interfaces)),
        # Then re-run extraction or set default
        F.coalesce(
            *[F.when(F.col("temp_title").contains(interface), interface) for interface in hardware_interfaces],
            F.lit("USB")
        )
    ).otherwise(F.col("Hardware Interface"))
)

In [20]:
spark_df.select('temp_title').show(truncate = False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|temp_title                                                                                                                                                                                          |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|MFi Certified 256GB Flash Drive for iPhone iPad Photo Stick for External Memory Storage Easily Back Up and Save Photos Videos 4 in 1 USB Memory Stick Thumb Drive for iPhone/iPad/Android/PC (Black)|
|THKAILAR USB C Flash Drive 512GB 2 in 1 OTG USB 3.1 Thumb Drive Memory Stick for Business Traveler Compatible with Android Phone/PC/Mac Pro External Storage Data (Red)                             |
|SCIC

In [21]:
spark_df.select('temp_title', 'Hardware Interface').show(truncate = True, n = 67)

+--------------------+------------------+
|          temp_title|Hardware Interface|
+--------------------+------------------+
|MFi Certified 256...|               USB|
|THKAILAR USB C Fl...|               USB|
|SCICNCE 128GB Pho...|           USB 3.0|
|Seagate Portable ...|           USB 3.0|
|SanDisk 2TB Extre...|               USB|
|SAMSUNG 990 PRO S...|               USB|
|Seagate Storage E...|               USB|
|Toshiba Canvio Ba...|           USB 3.0|
|Crucial P3 Plus 2...|               USB|
|WD_BLACK 2TB SN85...|               USB|
|SAMSUNG T7 Shield...|               USB|
|SanDisk 128GB Ult...|           USB 3.0|
|Kingston Ironkey ...|     USB 3.2 Gen 1|
|USB Flash Drive 5...|               USB|
|10 Pack 64GB Flas...|           USB 2.0|
|WD_BLACK 2TB SN85...|               USB|
|SAMSUNG T7 Portab...|               USB|
|Samsung 870 EVO S...|               USB|
|Western Digital W...|               USB|
|Crucial BX500 1TB...|               USB|
|Amazon Basics - 2...|            

In [22]:
spark_df = spark_df.drop("temp_title")

In [23]:
# Adjust the pattern to match speeds specifically in MB/s or similar units
write_speed_pattern = r"(\d{1,3}(?:,\d{3})*(?:\.\d+)?)(?=\s?(MB/s|Megabytes Per Second|MB per second))"

# Apply the pattern to extract write speed, remove commas, and convert to float
spark_df = spark_df.withColumn(
    "Write Speed",
    F.when(
        F.col("Write Speed").rlike(write_speed_pattern),
        F.regexp_replace(F.regexp_extract(F.col("Write Speed"), write_speed_pattern, 1), ",", "").cast("float")
    ).when(
        F.col("title").rlike(write_speed_pattern),
        F.regexp_replace(F.regexp_extract(F.col("title"), write_speed_pattern, 1), ",", "").cast("float")
    ).otherwise(F.lit(0))
)

In [24]:
spark_df.select('title', 'Write Speed').show(n = 67, truncate = True)

+--------------------+-----------+
|               title|Write Speed|
+--------------------+-----------+
|MFi Certified 256...|        0.0|
|THKAILAR USB C Fl...|        0.0|
|SCICNCE 128GB Pho...|       40.0|
|Seagate Portable ...|       20.0|
|SanDisk 2TB Extre...|       50.0|
|SAMSUNG 990 PRO S...|     7450.0|
|Seagate Storage E...|        0.0|
|Toshiba Canvio Ba...|        0.0|
|Crucial P3 Plus 2...|        0.0|
|WD_BLACK 2TB SN85...|     7300.0|
|SAMSUNG T7 Shield...|       50.0|
|SanDisk 128GB Ult...|        0.0|
|Kingston Ironkey ...|      115.0|
|USB Flash Drive 5...|       20.0|
|10 Pack 64GB Flas...|        8.0|
|WD_BLACK 2TB SN85...|     7300.0|
|SAMSUNG T7 Portab...|     1050.0|
|Samsung 870 EVO S...|        0.0|
|Western Digital W...|     5150.0|
|Crucial BX500 1TB...|      540.0|
|Amazon Basics - 2...|        0.0|
|SanDisk 128GB Ult...|        4.0|
|SanDisk 4TB Extre...|        0.0|
|SAMSUNG 990 EVO S...|     5000.0|
|WD 20TB Elements ...|        0.0|
|SAMSUNG T9 Portab..

In [25]:
color_pattern = r"(?i)(Black|White|Gray|Red|Blue|Green|Yellow|Pink|Silver|Gold)" # Thêm nếu cần

# Xử lý cho trường hợp đã có dữ liệu
accepted_colors = ["black", "white", "gray", "red", "blue", "green", "yellow", "pink", "silver", "gold"]

# `Color` 
spark_df = spark_df.withColumn(
    "Color",
    F.when(
        F.col("Color").isNull() & F.col("title").rlike(color_pattern),  # Lấy dữ liệu từ Title nếu Color NULL
        F.lower(F.regexp_extract(F.col("title"), color_pattern, 1))  # Chuẩn hóa lowercase
    ).when(
        F.col("Color").isNull(),  # Trong trường hợp Title không có, thế = "others"
        F.lit("others")
    ).when(
        ~F.col("Color").isin(accepted_colors),  # Nếu dữ liệu có sẵn không phù hợp pattern thì thế = "others"
        F.lit("others")
    ).otherwise(
        F.lower(F.col("Color"))  # Nếu hợp thì chuẩn hóa
    )
)

In [26]:
spark_df.select('title', 'Color').show(n = 67, truncate = True)

+--------------------+------+
|               title| Color|
+--------------------+------+
|MFi Certified 256...| black|
|THKAILAR USB C Fl...|   red|
|SCICNCE 128GB Pho...|  gray|
|Seagate Portable ...|others|
|SanDisk 2TB Extre...|others|
|SAMSUNG 990 PRO S...|others|
|Seagate Storage E...| black|
|Toshiba Canvio Ba...| black|
|Crucial P3 Plus 2...|others|
|WD_BLACK 2TB SN85...| black|
|SAMSUNG T7 Shield...|others|
|SanDisk 128GB Ult...| black|
|Kingston Ironkey ...|others|
|USB Flash Drive 5...|  blue|
|10 Pack 64GB Flas...|others|
|WD_BLACK 2TB SN85...| black|
|SAMSUNG T7 Portab...|  gray|
|Samsung 870 EVO S...|others|
|Western Digital W...| black|
|Crucial BX500 1TB...|others|
|Amazon Basics - 2...| black|
|SanDisk 128GB Ult...| black|
|SanDisk 4TB Extre...|others|
|SAMSUNG 990 EVO S...| black|
|WD 20TB Elements ...|others|
|SAMSUNG T9 Portab...| black|
|Samsung Type-C™ U...|  blue|
|Seagate BarraCuda...|others|
|LaCie Rugged Mini...|others|
|Kingston NV2 1TB ...|others|
|256GB Pho

In [27]:
spark_df.show()

+--------------------+------+---------+--------------------+-----------------+-------+---------+---------------+------------------+--------------------+-----------------------+-----------+-------------------+---------------------+---------------------+--------------------+-----------------+------+----------+------------------+----+----------+--------------+-----------------------------+----------------------------+
|               title| price|old_price|         product_url|           rating|reviews|purchases|          Brand|Hardware Interface|     Special Feature|Connectivity Technology|Write Speed|Hard Disk Interface|Hard Disk Form Factor|Hard Disk Description|  Compatible Devices|Installation Type| Color|Model Name|Product Dimensions|Size|Read Speed|Hard Disk Size|Digital Storage Capacity (GB)|Memory Storage Capacity (GB)|
+--------------------+------+---------+--------------------+-----------------+-------+---------+---------------+------------------+--------------------+----------

In [28]:
#from pyspark.sql.functions import to_json, struct

# Convert each row in DataFrame to a JSON string
#kafka_df = spark_df.select(to_json(struct(*spark_df.columns)).alias("value"))

# Kafka configurations
#kafka_bootstrap_servers = "localhost:9092"  
#kafka_topic = "BigData"         

#kafka_df.write \
#    .format("kafka") \
#    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \ #A
#    .option("topic", kafka_topic) \
#    .save()

In [29]:
output_path = "C:\\TEMPDATA"
spark_df.write.csv(output_path, 
                   mode="overwrite",
                   header=True)       

In [30]:
spark_df.show()

+--------------------+------+---------+--------------------+-----------------+-------+---------+---------------+------------------+--------------------+-----------------------+-----------+-------------------+---------------------+---------------------+--------------------+-----------------+------+----------+------------------+----+----------+--------------+-----------------------------+----------------------------+
|               title| price|old_price|         product_url|           rating|reviews|purchases|          Brand|Hardware Interface|     Special Feature|Connectivity Technology|Write Speed|Hard Disk Interface|Hard Disk Form Factor|Hard Disk Description|  Compatible Devices|Installation Type| Color|Model Name|Product Dimensions|Size|Read Speed|Hard Disk Size|Digital Storage Capacity (GB)|Memory Storage Capacity (GB)|
+--------------------+------+---------+--------------------+-----------------+-------+---------+---------------+------------------+--------------------+----------

In [31]:
from pyspark.sql.functions import col, min, max

# Các feature cần sử dụng MinMaxScaler
numerical_features = ["price", "old_price", "rating", "reviews", "purchases", "Write Speed", "Digital Storage Capacity (GB)", "Memory Storage Capacity (GB)"]

# Loop qua các features
for feature in numerical_features:
    # Tìm min và max của feature đó
    feature_min = spark_df.select(min(col(feature))).first()[0]
    feature_max = spark_df.select(max(col(feature))).first()[0]

    # Kiểm tra tránh trường hợp chia 0
    if feature_max - feature_min != 0:
        # Áp dụng thuật toán MinMax
        spark_df = spark_df.withColumn(
            feature,
            (col(feature) - feature_min) / (feature_max - feature_min)
        )
    else:
        # Trường hợp nếu toàn bộ giá trị đều như nhau
        spark_df = spark_df.withColumn(feature, F.lit(0))

In [32]:
spark_df.show(n = 67)

+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+---------+--------------------+------------------+---------------------+-----------------------+--------------------+-------------------+---------------------+---------------------+--------------------+-------------------+------+--------------------+--------------------+------+--------------------+--------------+-----------------------------+----------------------------+
|               title|               price|           old_price|         product_url|            rating|             reviews|purchases|               Brand|Hardware Interface|      Special Feature|Connectivity Technology|         Write Speed|Hard Disk Interface|Hard Disk Form Factor|Hard Disk Description|  Compatible Devices|  Installation Type| Color|          Model Name|  Product Dimensions|  Size|          Read Speed|Hard Disk Size|Digital Storage Capacity (GB)|Memory Storage Capacity (GB)|
+-

In [33]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, lit

# Function để xử lý StringIndex từ các features
def index_column(df, column_name):
    # Lấy các giá trị unique của feature
    unique_values_df = df.select(column_name).distinct()
    
    # Gán số thứ tự cho các giá trị đó
    indexed_values = unique_values_df.withColumn(
        f"{column_name} Index",
        row_number().over(Window.orderBy(lit(0))) - 1  
    )
    
    # Join dữ liệu lại bảng chính
    return df.join(indexed_values, on=column_name, how="left")

# "Color"
spark_df = index_column(spark_df, "Color")

# "Hardware Interface"
spark_df = index_column(spark_df, "Hardware Interface")

# "Brand"
spark_df = index_column(spark_df, "Brand")

In [34]:
spark_df.show(n = 67)

+--------------------+------------------+------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+---------+---------------------+-----------------------+--------------------+-------------------+---------------------+---------------------+--------------------+-------------------+--------------------+--------------------+------+--------------------+--------------+-----------------------------+----------------------------+-----------+------------------------+-----------+
|               Brand|Hardware Interface| Color|               title|               price|           old_price|         product_url|            rating|             reviews|purchases|      Special Feature|Connectivity Technology|         Write Speed|Hard Disk Interface|Hard Disk Form Factor|Hard Disk Description|  Compatible Devices|  Installation Type|          Model Name|  Product Dimensions|  Size|          Read Speed|Hard Disk Size|Digital Stora

In [35]:
spark_df.select('Color Index', 'Hardware Interface Index', 'Brand Index').show(n = 67, truncate = True)

+-----------+------------------------+-----------+
|Color Index|Hardware Interface Index|Brand Index|
+-----------+------------------------+-----------+
|          3|                       4|         11|
|          2|                       4|          5|
|          0|                       0|          0|
|          1|                       0|         12|
|          1|                       4|         20|
|          1|                       4|         21|
|          3|                       4|         22|
|          3|                       0|         11|
|          1|                       4|          3|
|          3|                       4|          7|
|          1|                       4|         21|
|          3|                       0|         20|
|          1|                       3|          8|
|          4|                       4|         10|
|          1|                       1|          9|
|          3|                       4|         18|
|          0|                  

In [36]:
spark_df.select('Brand', 'Brand Index').show(n = 67, truncate = True)

+--------------------+-----------+
|               Brand|Brand Index|
+--------------------+-----------+
|              others|         11|
|              SUDEHO|          5|
|            THKAILAR|          0|
|             SCICNCE|         12|
|             SanDisk|         20|
|             SAMSUNG|         21|
|             Seagate|         22|
|              others|         11|
|             Crucial|          3|
|            WD_BLACK|          7|
|             SAMSUNG|         21|
|             SanDisk|         20|
|            Kingston|          8|
|             GNASEET|         10|
|           FEBNISCTE|          9|
|                  WD|         18|
|             SAMSUNG|         21|
|              others|         11|
|     Western Digital|         17|
|             Crucial|          3|
|       Amazon Basics|          6|
|             SanDisk|         20|
|             SanDisk|         20|
|             SAMSUNG|         21|
|                  WD|         18|
|             SAMSUN

In [37]:
columns_to_drop = [
    "Special Feature",
    "Connectivity Technology",
    "Hard Disk Interface",
    "Installation Type",
    "Model Name",
    "Product Dimensions",
    "Size",
    "Read Speed",
    "Hard Disk Size",
    "Hard Disk Form Factor",
    "Hard Disk Description",
    "Compatible Devices",
    "product_url",
    "Brand",
    "Color",
    "Hardware Interface"
]

spark_df = spark_df.drop(*columns_to_drop)

In [38]:
spark_df = spark_df.na.drop()

In [39]:
from pyspark.sql.functions import col

spark_df = spark_df.withColumn("price", col("price").cast("double")) \
                   .withColumn("old_price", col("old_price").cast("double")) \
                   .withColumn("rating", col("rating").cast("double")) \
                   .withColumn("reviews", col("reviews").cast("double")) \
                   .withColumn("purchases", col("purchases").cast("double")) \
                   .withColumn("Write Speed", col("Write Speed").cast("double")) \
                   .withColumn("Digital Storage Capacity (GB)", col("Digital Storage Capacity (GB)").cast("double")) \
                   .withColumn("Memory Storage Capacity (GB)", col("Memory Storage Capacity (GB)").cast("double")) \
                   .withColumn("Color Index", col("Color Index").cast("double")) \
                   .withColumn("Hardware Interface Index", col("Hardware Interface Index").cast("double")) \
                   .withColumn("Brand Index", col("Brand Index").cast("double")) \

spark_df.printSchema()

root
 |-- title: string (nullable = true)
 |-- price: double (nullable = true)
 |-- old_price: double (nullable = true)
 |-- rating: double (nullable = true)
 |-- reviews: double (nullable = true)
 |-- purchases: double (nullable = true)
 |-- Write Speed: double (nullable = true)
 |-- Digital Storage Capacity (GB): double (nullable = true)
 |-- Memory Storage Capacity (GB): double (nullable = true)
 |-- Color Index: double (nullable = true)
 |-- Hardware Interface Index: double (nullable = true)
 |-- Brand Index: double (nullable = true)



In [40]:
output_path = "C:\\TEMPDATA\\DataFunction"
spark_df.write.csv(output_path, 
                   mode="overwrite",
                   header=True)  