#### Practice "Data Munging"

###### Passive Data Munging

In [0]:

# Passive profiling: load the raw customer file and inspect its shape and content
# without changing anything.
# No schema is supplied and inferSchema is left at its default, so every column is
# read as a string; toDF() only assigns the column names.
rawdf=spark.read.csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/BB_Practice data/custsmodified").toDF("id","firstname","lastname","age","profession")
#rawdf.show(20,False)
print("display using take function")
display(rawdf.take(20))
print("display using sampling")
display(rawdf.sample(.1))

#Finding data structure
print("print schema")
# Every column shows as string because the file was read without a schema; this alone
# does not prove that id/age contain non-numeric values (see the rlike checks later on).
rawdf.printSchema()
print("identifying Schema")
print(rawdf.schema) # prints the schema structure; it can be copied to define a StructType
print("print columns")
print(rawdf.columns)
print("print data types")
print(rawdf.dtypes)

#Identify the string-typed columns programmatically
for column_name, column_type in rawdf.dtypes:
    if column_type=="string":
        print(column_name)

#Finding total row count and duplicate rows
print("actual count of data", rawdf.count()) #10005
print("de-duplication of all columns",rawdf.distinct().count()) #10004
print("de-duplication of records using dropduplicate",rawdf.dropDuplicates().count())
print("de-duplicate of specified column",rawdf.dropDuplicates(['id']).count())
# describe()/summary() return DataFrames; print() would only show their column list,
# so render them to see the actual statistics.
display(rawdf.describe())
display(rawdf.summary())

###### Active Data Munging

In [0]:
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.appName("Wd-36 ETA Pipeline BB Practice").getOrCreate()
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ShortType, DataType, DateType, time

# Merge schema & schema evolution: two files arrive on the same day with different
# layouts, so each gets its own schema and the results are combined with unionByName.
# Any column missing on one side is filled with NULL (allowMissingColumns=True).
# A plain union() is only suitable when both inputs carry the same columns and types.
landing_root = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/"

# All fields are nullable strings; only the column names differ between the two layouts.
struct1 = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("id", "firstname", "lastname", "age", "profession", "corruptedrows")
])
rawdf1 = spark.read.schema(struct1).csv(
    landing_root,
    pathGlobFilter="custsmodified_N*",
    recursiveFileLookup=True,
)
display(rawdf1)

struct2 = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("id", "firstname", "age", "profession", "city")
])
rawdf2 = spark.read.schema(struct2).csv(
    landing_root,
    pathGlobFilter="custsmodified_T*",
    recursiveFileLookup=True,
)
display(rawdf2)

rawdf_merged = rawdf1.unionByName(rawdf2, allowMissingColumns=True)
display(rawdf_merged)

# For comparison, the positional rawdf1.union(rawdf2) is rejected because the inputs
# have a different number of columns (struct1 has 6 fields, struct2 has 5):
# [NUM_COLUMNS_MISMATCH] UNION can only be performed on inputs with the same number
# of columns. SQLSTATE: 42826


###### Validation
###### - count(): gives the entire data count
###### - len(cleandf.collect()): gives only the clean record count, with malformed records removed
###### - display(): renders the DataFrame

In [0]:
# Read modes: load the same file once in "permissive" mode and once in "dropMalformed"
# mode and compare the resulting row counts.
custs_path = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/BB_Practice data/custsmodified"

# permissive
cleandf = (
    spark.read
    .schema(struct1)
    .option("mode", "permissive")
    .csv(custs_path)
)
print("count displays entire data count", cleandf.count())  # observed: all 10005 rows
display(cleandf)

# or: dropMalformed
cleandf = (
    spark.read
    .schema(struct1)
    .option("mode", "dropMalformed")
    .csv(custs_path)
)
# collect() brings every row to the driver, so this is an expensive way to count;
# observed: 10002 rows, i.e. 3 malformed records removed.
print("count displays data count after removing malformed data", len(cleandf.collect()))
display(cleandf)


###### Rejection strategy: remove all corrupted records/datasets

In [0]:
# Rejection strategy: read in permissive mode, route each malformed line into the
# "corruptedrows" column, then split the frame into rejected and retained rows.
custs_path = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/BB_Practice data/custsmodified"
rejected_path = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/custrejected"

cleandf = (
    spark.read
    .schema(struct1)
    .option("mode", "permissive")
    .option("columnNameOfCorruptRecord", "corruptedrows")
    .csv(custs_path)
)
cleandf.printSchema()

# Rows whose corrupt-record column is populated are the rejects; persist them for audit.
rejecteddf = cleandf.where("corruptedrows is not null")
display(rejecteddf)
rejecteddf.write.csv(rejected_path, mode="overwrite", header=True)

# Everything else is kept for further processing.
retaineddf = cleandf.where("corruptedrows is null")
display(retaineddf)

# Materialise each frame on the driver and report its row count.
total_rows = len(cleandf.collect())
print("show the entire data count", total_rows)
rejected_rows = len(rejecteddf.collect())
print("shows all rejected rows", rejected_rows)
retained_rows = len(retaineddf.collect())
print("shows all retained count after cleaning", retained_rows)


###### Cleaning - making the data clean
na.drop()

In [0]:
# Cleansing with na.drop(): three variants are compared, each under its own name so
# that re-running or reordering lines cannot mix them up. Only the last one
# (cleanseddf) is carried forward to the scrubbing step.

# 1) how="any" with no subset: drop every row that has a null in at least one column.
cleansed_any_null = rawdf.na.drop(how="any")
print("cleanseddf with all null col dropped", cleansed_any_null.count()) #9913
display(rawdf.where("age is null"))
print("cleanseddf with any col. null")
display(cleansed_any_null.where("id is null")) # returns no records, so the null rows were dropped as expected

# 2) subset=["id"]: drop only rows whose id is null; nulls in other columns are retained.
print("cleansed data with any id col having null are dropped while other fields with null are retained")
cleansed_id_not_null = rawdf.na.drop(how="any",subset=["id"])
display(cleansed_id_not_null.where("profession is null"))
# The message now names the column that is actually in the subset (id only).
print("count after eliminating all datasets having null in id", cleansed_id_not_null.count()) #9998

# 3) subset=["firstname","lastname"]: drop rows with a null in either name column.
cleanseddf = rawdf.na.drop(how="any",subset=["firstname","lastname"])
# count() is evaluated by Spark and avoids collecting every row to the driver just to take len().
print("count after eliminating all datasets having null in firstname and lastname", cleanseddf.count())

###### Scrubbing
na.fill() & na.replace()

In [0]:
# Scrubbing: fill the remaining nulls with a placeholder, then normalise that
# placeholder and repair a known bad id value.
scrub_columns = ["age", "lastname", "profession"]

# Step 1 - nulls in the selected columns become "not provided".
scrubbeddf = cleanseddf.na.fill("not provided", subset=scrub_columns)
display(scrubbeddf.where ("age is null"))

# Step 2 - the placeholder is shortened to "NA" in the same columns.
scrubbeddf2 = scrubbeddf.na.replace("not provided", "NA", subset=scrub_columns)
display(scrubbeddf2)

# Step 3 - dictionary form of replace: map the textual id "ten" to "4000005".
replace_scrubbed_data = {"ten": "4000005"}
scrubbeddf3 = scrubbeddf2.na.replace(replace_scrubbed_data, subset=["id"])
display(scrubbeddf3)
# "NA" is an ordinary string after step 2, so rows carrying it would be selected by
# comparing against the literal (age = 'NA'), not with an "is NA" predicate.

###### Deduplicate
data.distinct() or data.dropDuplicates()

In [0]:
# De-duplication: distinct() removes rows that are identical in every column,
# dropDuplicates(subset=[...]) keeps one row per key.
display(scrubbeddf2.where("id in ('4000001')")) # 2 records displayed
dedupdf = scrubbeddf2.distinct()
display(dedupdf.where("id in ('4000001')")) # 1 is dropped and the distinct record is shown

# One row per id, taken from the already-distinct frame.
dedupdf1 = dedupdf.coalesce(1).dropDuplicates(subset=["id"])
display(dedupdf1.where("id in ('4000003')"))

# orderBy()/sort() only reads the lowercase keyword `ascending`. Any other keyword
# (e.g. `Ascending=` or `Descending=`) is silently ignored and the default ascending
# order is used, which is why ascending and descending attempts returned the same row.
# NOTE(review): dropDuplicates() does not document which of the duplicate rows it keeps,
# so sorting first is only a best-effort way to steer the choice - confirm, or use a
# window with row_number() if the surviving row matters.
dedupdf2 = (
    scrubbeddf2
    .coalesce(1)
    .orderBy(["id","age"], ascending=[True,True])
    .dropDuplicates(subset=["id"])
)
display(dedupdf2.where("id in ('4000003')"))


###### Standardization
withColumn()/withColumns(), withColumnRenamed()/withColumnsRenamed(), regexp_replace(), select()

In [0]:
# Standardization and egress: enrich, normalise, type-cast, rename and reorder the
# de-duplicated data, run final checks, then write the result.
#Adding new column
from pyspark.sql.functions import lit, initcap, col, upper, rlike, regexp_replace, replace

stddf = dedupdf2.withColumn("sourcesystem", lit("Retails"))
display(stddf.limit(10))

#Data uniformity: capitalise the first letter of each word in profession
stddf1 = stddf.withColumn("profession",initcap(col("profession")))
display(stddf1.limit(10))

#Format standardization: list ids containing letters and ages containing non-digits
stddf1.where("id rlike '[a-zA-Z]'").show()
stddf1.where("age rlike '[^0-9]'").show()

# Continue from stddf1 so the de-duplication, sourcesystem column and profession
# normalisation above are part of what gets written. Starting again from scrubbeddf3
# would discard all three.
# NOTE(review): dedupdf2 was de-duplicated before this id fix, so if a row with id
# 4000005 already exists the replacement can reintroduce a duplicate id - confirm,
# or de-duplicate on id once more after this step.
std_replace_data = {"ten":"4000005"}
stddf2_ids_fixed = stddf1.na.replace(std_replace_data,subset=["id"])
display(stddf2_ids_fixed.where ("id in ('4000005')"))
display(stddf2_ids_fixed)
# Pattern replacement: remove "-" from age, e.g. 7-7 becomes 77.
stddf2 = stddf2_ids_fixed.withColumn("age",regexp_replace(col("age"),"-",""))
display(stddf2)

#Datatype conversion
# NOTE(review): age still holds the "NA" placeholder from scrubbing; presumably the cast
# turns it into null, but with ANSI mode enabled it may raise instead - confirm.
stddf2.printSchema()
stddf3 = (
    stddf2
    .withColumn("id",col("id").cast("long"))
    .withColumn("age",col("age").cast("short"))
)
stddf3.printSchema()

#using withColumnRenamed()/withColumnsRenamed() to rename the header columns
stddf4 = stddf3.withColumnsRenamed({"id":"custid","profession":"prof"})
display(stddf4)

# to see columns in a different order from the given order use "select"
stddf5 = stddf4.select('custid','firstname','lastname','prof','age','sourcesystem')

# before egress check data quality & data integrity
mungeddf = stddf5
mungeddf.printSchema()
print("row count",mungeddf.count())
print(mungeddf.schema)
# summary must be called; passing the bound method would display nothing useful.
display(mungeddf.summary())

mungeddf.write.csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/BB_Practice data/custsmodified_target",mode="overwrite",header=True)