### Pre-requisites
* Install dbldatagen from Databricks Labs for data generation

In [0]:
%pip install git+https://github.com/databrickslabs/dbldatagen

In [0]:
#dbutils is a databricks utility function 
dbutils.fs.rm('/tmp/ch8', True)
spark.sql("DROP DATABASE IF EXISTS ch8 CASCADE")
spark.sql("CREATE DATABASE ch8")

### Synthetic Data Generation

In [0]:
import dbldatagen as dg
from pyspark.sql.types import IntegerType, FloatType, StringType

deltaDataPath='/tmp/chapter8/csvToDelta'
df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=1000000, partitions=4)
            .withIdOutput()
            .withColumn("r", FloatType(), expr="floor(rand() * 350) * (86400 + 3600)", numColumns=10)
            .withColumn("code1", IntegerType(), minValue=100, maxValue=200)
            .withColumn("code2", IntegerType(), minValue=0, maxValue=10)
            .withColumn("code3", StringType(), values=['a', 'b', 'c'])
            .withColumn("code4", StringType(), values=['a', 'b', 'c'], random=True)
            .withColumn("code5", StringType(), values=['a', 'b', 'c'], random=True, weights=[9, 1, 1]))
                            
syntheticData_df = df_spec.build()
syntheticData_df.write.format('delta').mode('overwrite').save(deltaDataPath)
print(syntheticData_df.count())

In [0]:
%sql 
select count(*) from delta.`/tmp/chapter8/csvToDelta`

count(1)
1000000


### Statistical Soundness

In [0]:
dataPath = '/databricks-datasets/learning-spark-v2/people/people-10m.delta'
spark.sql(""" 
    CREATE TABLE IF NOT EXISTS ch8.some_delta_table  
    USING delta  
    OPTIONS (path = '{}') 
  """.format(dataPath)) 

In [0]:
df = spark.read.format('delta').table('some_delta_table')
display(df)

from pyspark.sql.functions import *
df.describe('gender', 'salary').show()

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
3766824,Hisako,Isabella,Malitrott,F,1961-02-12T05:00:00.000+0000,938-80-1874,58862
3766825,Daisy,Merissa,Fibben,F,1998-05-19T04:00:00.000+0000,971-14-3755,66221
3766826,Caren,Blossom,Henner,F,1962-08-06T04:00:00.000+0000,954-19-8973,54376
3766827,Darleen,Gertie,Goodinson,F,1980-03-12T05:00:00.000+0000,981-65-5269,69954
3766828,Kyle,Lu,Habben,F,1974-02-15T04:00:00.000+0000,936-95-3240,56681
3766829,Melia,Kristy,Bonhill,F,1970-09-13T04:00:00.000+0000,960-91-9232,73995
3766830,Yevette,Faye,Bebbell,F,1972-09-07T04:00:00.000+0000,987-72-3701,92888
3766831,Delpha,Kenisha,Gillison,F,1979-06-25T04:00:00.000+0000,962-66-5404,51206
3766832,Mikaela,Jenifer,Hallan,F,1973-05-23T04:00:00.000+0000,911-38-3114,98887
3766833,Cindi,Renita,Cousin,F,1979-03-19T05:00:00.000+0000,666-50-3216,63646


In [0]:
print(df.stat.cov('id', 'salary'))
print(df.stat.corr('id', 'salary'))

In [0]:
from pyspark.mllib.stat import Statistics
import numpy as np

def parse_data(record):
    return np.array([float(x) for x in record])

vector_data = df.select('id', 'salary').rdd.map(parse_data)
summary = Statistics.colStats(vector_data)
print(summary.mean(), summary.variance(), summary.numNonzeros())

print(Statistics.corr(vector_data, method='pearson'))

In [0]:
from pyspark.sql.functions import countDistinct
countDistinctDF = (df.select('gender', 'salary')
                                      .groupBy('gender')
                                      .agg(countDistinct('salary').alias('class')) )
display(countDistinctDF)

gender,class
F,128370
M,127369


In [0]:
from pyspark.ml.stat import KolmogorovSmirnovTest

dataset = [[-1.0], [0.0], [1.0]]
dataset = spark.createDataFrame(dataset, ['sample'])
ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
print(ksResult.pValue, ksResult.statistic)

### Compensating for missing & out of range data

In [0]:
columns = ["State","Name", "age"]
data = [("TX","Jack", -1), ("NV","Jane",66), ("CO","Bill",79),("CA","Tom",53), ("WY","Shawn",45)]

age_df = spark.sparkContext.parallelize(data).toDF(columns)
display(age_df)

State,Name,age
TX,Jack,-1
NV,Jane,66
CO,Bill,79
CA,Tom,53
WY,Shawn,45


In [0]:
from pyspark.sql.functions import *
avg_age = age_df.where(col('age')!=-1).agg(avg('age')).first()[0]
age_df=age_df.withColumn("age_new", when(col('age') == -1, lit(avg_age)).otherwise(col('age')))
display(age_df)

State,Name,age,age_new
TX,Jack,-1,60.75
NV,Jane,66,66.0
CO,Bill,79,79.0
CA,Tom,53,53.0
WY,Shawn,45,45.0


In [0]:
from pyspark.ml.feature import Imputer
imputer = (Imputer(inputCols=weather_df.columns, 
                  outputCols=["{}_imputed".format(c) for c in weather_df.columns])
                  .setStrategy("median"))
imputer.fit(weather_df).transform(weather_df).show()

In [0]:
columns = ["temperature","wind"]
data = [{"temperate": 25,"wind": 16},{"temperature": 25},{"wind": 25}]
weather_df = spark.sparkContext.parallelize(data).toDF(columns)
display(weather_df)

temperature,wind
25.0,16.0
,
,25.0


In [0]:
impute_cols = ['temperature', 'wind']
mean_df = weather_df.na.drop().agg(*[avg(c).alias(c) for c in impute_cols])
mean_weather_df = weather_df.na.fill(mean_df.first().asDict())
display(mean_weather_df)

temperature,wind
25,16
25,16
25,25


In [0]:
corruptImputed_df = weather_df.na.fill({"temperature": 10,"wind": 10})
display(corruptImputed_df)

temperature,wind
25,16
10,10
10,25
