In [1]:
# Install PySpark
!pip install pyspark




In [2]:
# Obtain the notebook's Spark session: reuse an already-running one if present,
# otherwise start a new one registered under this application name.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Task02_Preprocessing")
    .getOrCreate()
)


In [3]:
# Ask the user to upload the input file through Colab's upload helper.
# The load cell below reads "Online.csv" by relative path, so the file is
# expected to be uploaded under exactly that name.
# `uploaded` keeps whatever `files.upload()` returns; none of the cells shown
# here read it afterwards.
from google.colab import files
uploaded = files.upload()


Saving Online.csv to Online.csv


In [4]:
# Read the uploaded CSV into a Spark DataFrame.
# The first row supplies the column names and the column types are inferred
# from the data instead of being declared up front.
source_csv = "Online.csv"
df = spark.read.options(header=True, inferSchema=True).csv(source_csv)

# Sanity check: look at the first rows, then at the inferred schema.
df.show(5)
df.printSchema()


+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows

root
 |-- InvoiceNo: string (nullable = true)
 |

In [5]:
# Keep only rows where every column needed downstream is populated;
# a null in any one of these columns removes the row.
required_columns = ["Quantity", "UnitPrice", "Country"]
clean_df = df.na.drop(subset=required_columns)


In [6]:
from pyspark.sql.functions import col

# Value of each invoice line: units sold multiplied by the price per unit.
# The result is a double, so binary floating-point artefacts such as
# 15.299999999999999 can show up in the preview.
line_value = col("Quantity") * col("UnitPrice")
clean_df = clean_df.withColumn("TotalValue", line_value)

# Preview the two inputs next to the derived column.
clean_df.select("Quantity", "UnitPrice", "TotalValue").show(5)


+--------+---------+------------------+
|Quantity|UnitPrice|        TotalValue|
+--------+---------+------------------+
|       6|     2.55|15.299999999999999|
|       6|     3.39|             20.34|
|       8|     2.75|              22.0|
|       6|     3.39|             20.34|
|       6|     3.39|             20.34|
+--------+---------+------------------+
only showing top 5 rows



In [7]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric input columns into a single vector column named "features".
feature_columns = ["Quantity", "UnitPrice"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Appends "features" to every row of the cleaned data; existing columns stay.
final_df = assembler.transform(clean_df)

# Preview the assembled vector beside the derived TotalValue column.
final_df.select("features", "TotalValue").show(5)


+----------+------------------+
|  features|        TotalValue|
+----------+------------------+
|[6.0,2.55]|15.299999999999999|
|[6.0,3.39]|             20.34|
|[8.0,2.75]|              22.0|
|[6.0,3.39]|             20.34|
|[6.0,3.39]|             20.34|
+----------+------------------+
only showing top 5 rows



In [8]:
from google.colab import files

# Export the plain numeric columns (the vector column is left out) to a CSV
# file and hand it to the browser as a download.
export_path = "Cleaned_Online.csv"
export_columns = ["Quantity", "UnitPrice", "TotalValue"]

# toPandas() collects the selected rows onto the driver as a pandas DataFrame.
pandas_df = final_df.select(*export_columns).toPandas()
pandas_df.to_csv(export_path, index=False)

files.download(export_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>