In [1]:
# I'm using Ubuntu 20.04, Python 3.8.10, and Spark 3.4.0

In [None]:
import time
import pyspark
import os
import glob
import shutil

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType
from pyspark.sql.window import Window

from pyspark.sql.functions import (
    concat,
    col, 
    lit, 
    row_number, 
    monotonically_increasing_id, 
    to_date, 
    to_utc_timestamp, 
    date_format, 
    lpad,
    from_unixtime,
    unix_timestamp,
    to_timestamp,
    expr
)

In [2]:
# Wall-clock start of the run; the final cell subtracts this from its own
# time.time() reading to report the total duration.
start_time = time.time()
# Show which PySpark build the kernel imported.
print(f"Spark version: {pyspark.__version__}")

Spark version: 3.4.0


In [None]:
# Spark configuration for a local run.
#   - master "local": Spark runs inside this process with one worker thread
#     (switch to "local[*]" to use every available core).
#   - spark.driver.extraClassPath: extra jars added to the driver classpath;
#     you must specify the jars location for your machine.
conf = (
    SparkConf()
    .setAppName("CSV Cleaning")
    .setMaster("local")
    .set("spark.driver.extraClassPath", "/home/nicolasterroni/projects/spark/jars/*")
)

# Create the session through the documented builder entry point. getOrCreate()
# reuses an already-running session when this cell is executed again instead
# of constructing a second SparkSession wrapper by hand.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Kept for code that expects the low-level context under the name `sc`.
sc = spark.sparkContext

In [None]:
# Input location: edit both values for your environment.
main_path = '/home/nicolasterroni/projects/spark/examples/csv_cleaning/'  # directory holding the csv
csv_name = 'test.csv'  # file name inside main_path

# Load the file with its header row; make sure the delimiter matches the file.
df = (
    spark.read
    .options(header=True, delimiter='|')
    .csv(main_path + csv_name)
)
df.printSchema()

In [None]:
# peek at the first three rows of the freshly loaded data
df.take(3)

In [6]:
# DATA MODELING

# example of joining 2 dataframes
# df = df.join(df2, on="id", how="inner")

# delete column
# df = df.drop("column")

# converting 'revenue' column from a string like '55.000,00' to float
# (a Column has no .replace() method, so use the SQL replace() function through expr)
#df = df.withColumn('revenue', expr("replace(revenue, '.', '')"))
#df = df.withColumn('revenue', expr("replace(revenue, ',', '.')"))
#df = df.withColumn('revenue', col('revenue').cast('float'))

# check the transformed data
# df.printSchema()
# df.head(3)


In [None]:
# DATA CLEANING

# examples of changing dates and datetimes format
#df = df.withColumn("date", from_unixtime(unix_timestamp(df["date"], "d/M/yyyy"), "yyyy-MM-dd"))
#df = df.withColumn("datetime", from_unixtime(unix_timestamp(df["datetime"], "d/M/yyyy HH:mm:ss"), "yyyy-MM-dd HH:mm:ss"))

# example of string manipulation: getting a substring (drops the last 4 characters)
#df = df.withColumn("string", expr("substring(string, 1, length(string) - 4)"))

# convert string to timestamp
#df = df.withColumn("datetime", to_timestamp(df["datetime"], "yyyy/MM/dd HH:mm:ss"))

# check the transformed data
# df.printSchema()
# df.head(3)

In [10]:
# Spark writes its output as a directory of part files plus bookkeeping files
# (_SUCCESS, *.crc), not as a single csv. The DataFrame is therefore coalesced
# to one partition, written to a temporary directory, and the resulting part
# file is moved next to the original csv.
# I added the 'clean_' prefix to recognize the cleaned csv from the original

# temporary save path: <main_path>/tmp/<csv name without its extension>
# (splitext handles extensions of any length, unlike slicing off 4 characters)
tmp_root = os.path.join(main_path, "tmp")
save_path = os.path.join(tmp_root, os.path.splitext(csv_name)[0])
clean_csv_path = os.path.join(main_path, "clean_" + csv_name)

# ensure to specify your wanted delimiter
df.coalesce(1).write.options(header='True', delimiter='|').mode("overwrite").csv(save_path)

# coalesce(1) should leave exactly one part-*.csv; every other entry is metadata
part_files = [name for name in os.listdir(save_path) if name.endswith(".csv")]
if len(part_files) != 1:
    # Fail loudly instead of deleting the temporary output and reporting success.
    raise RuntimeError(
        f"Expected exactly one csv part file in {save_path}, found {len(part_files)}"
    )

# os.replace overwrites a clean_ file left by a previous run on every platform,
# whereas os.rename raises on Windows when the destination already exists.
os.replace(os.path.join(save_path, part_files[0]), clean_csv_path)

# The metadata files go away together with the temporary directory.
shutil.rmtree(tmp_root)
spark.stop()

end_time = time.time()
total_time = end_time - start_time
print(f"The process took {total_time} seconds.")

                                                                                

The process took 353.34016513824463 seconds.


In [None]:
"""
3.5gb CSV CLEANED AND RE-WRITTEN IN -----------> 353.3 SECONDS
"""