In [30]:
import com.ibm.spark.ingest.Extensions._

# Data preparation tutorial
In this tutorial, we will use Scala to discover drug and customer data, examine data profiles, view value distributions, and assess data quality. Each section will prepare the data for further analysis.

## Part 1: Discover drug data from a directory of multiple file types
The source directory /resources/data/sparklingdata/data/sampleDataDir contains several JSON files of drug data and one CSV file of customer data.
File format discovery loads the most frequently occurring type of data by default. In this case, the JSON drug data will be loaded.

In [31]:
import sys.process._
import java.net.URL
import java.io.File

/**
 * Downloads the resource at `url` and writes it to the local file `filename`.
 *
 * The transfer is expressed with `scala.sys.process`: the URL is used as the
 * source of a process pipeline whose output is redirected (`#>`) into the
 * target file. The pipeline is run with `!!`, which blocks until it finishes
 * and throws if it ends with a non-zero exit code.
 *
 * @param url      address of the resource to download
 * @param filename path of the local file to write
 * @return the text captured from the pipeline's standard output by `!!`
 */
def fileDownloader(url: String, filename: String): String = {
    // Call `!!` as a regular method rather than as a postfix operator: postfix
    // notation requires scala.language.postfixOps and can mis-parse when
    // another expression follows on the next line.
    (new URL(url) #> new File(filename)).!!
}

// Fetch the zipped sample dataset and store it as /resources/Sparklingdataset.zip.
// NOTE(review): later cells read from /resources/data/sparklingdata/...; no unzip step is visible in this notebook — confirm how the archive gets extracted.
fileDownloader("https://ibm.box.com/shared/static/9nxnsf6xwmuczjea911xjxp8l21yyd2x.zip", "/resources/Sparklingdataset.zip")

In [32]:
// Create the SQLContext used by the read calls below.
// `sc` is not defined in this notebook — presumably the SparkContext supplied by the kernel.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

In [33]:
// Load the whole sample directory through the "com.ibm.spark.discover" data source.
// According to the tutorial text, format discovery picks the most common file type
// in the directory, which here is the JSON drug data.
val df = sqlContext.read.format("com.ibm.spark.discover").load("/resources/data/sparklingdata/data/sampleDataDir")

## Visualize the drug data
Now that the drug data is loaded, run the following script to print its schema and display its rows.


In [34]:
// Print the column names and types of the loaded drug data, then preview its rows.
df.printSchema()
df.show()

root
 |-- Age: long (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholestrol: string (nullable = true)
 |-- Cost: string (nullable = true)
 |-- DateTested: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Ent#: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Potassium: double (nullable = true)
 |-- Sodium: double (nullable = true)



## Visualize data types and distribution profiles
Run the following script to produce four interactive charts so that we can explore the distribution profiles of the data.

In [35]:
// Infer a type for each column and build a profile of the result.
// `inferTypes` / `profile` are not standard DataFrame methods — presumably added by
// the com.ibm.spark.ingest.Extensions import; confirm.
val dfProfile = df.inferTypes.profile
// Schema after type inference (in the recorded output Cost becomes short and DateTested timestamp).
dfProfile.printSchema
// Render the profile; the tutorial text describes the result as interactive charts.
dfProfile.printProfile

root
 |-- Age: long (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholestrol: string (nullable = true)
 |-- Cost: short (nullable = true)
 |-- DateTested: timestamp (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Ent#: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Potassium: double (nullable = true)
 |-- Sodium: double (nullable = true)



# Investigate and fix any bad data
As we can see in the charts above, the numeric distributions chart shows 13 nulls in the Cost column. Let us investigate this further and fix any bad data.

In [36]:
// Run type inference with the "revealNA" option in "any" mode and the bracket pair >[ ... ]<.
// Presumably values that do not fit the inferred type are shown wrapped in those brackets
// so they stand out in the output — confirm against the library documentation.
val dfBaddata = df.enrich.option("revealNA", Map("mode" -> "any", "brackets" -> (">[", "]<"))).inferTypes
dfBaddata.show

In [37]:
// Same "revealNA" check, limited to the Cost column via the "columns" option and
// using "type" -> "struct" instead of a bracket pair.
// NOTE(review): exact meaning of the "struct" reveal type is not visible here — confirm.
val dfCostBaddata = df.enrich.option("columns", Array("Cost")).option("revealNA", Map("mode" -> "any","type" -> ("struct"))).inferTypes
dfCostBaddata.show

###  Fix bad data

In [38]:
// First pass at the Cost fix: restrict enrichment to the Cost column and infer
// types with locale "es", grouping enabled and field extraction disabled.
// Presumably the locale is what lets values such as "05,14" be read as numbers — confirm.
val drugDfForAnalysis_1 = df.enrich.
  option("columns", Array("Cost")).
  option("locale", "es").
  option("groupingUsed", true).
  option("extractFields", false).
  inferTypes

// Inspect the outcome: schema, rows, and the types reported for each column.
drugDfForAnalysis_1.printSchema
drugDfForAnalysis_1.show
drugDfForAnalysis_1.printTypes

root
 |-- Age: long (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholestrol: string (nullable = true)
 |-- Cost: string (nullable = true)
 |-- DateTested: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Ent#: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Potassium: double (nullable = true)
 |-- Sodium: double (nullable = true)

+---+------+----------+--------+-------------------+-----+----+------+---------+--------+
|Age|    BP|Cholestrol|    Cost|         DateTested| Drug|Ent#|Gender|Potassium|  Sodium|
+---+------+----------+--------+-------------------+-----+----+------+---------+--------+
| 23|  HIGH|      HIGH|      11|1980-10-11T10:10:20|drugY|   1|     F| 0.031258|0.792535|
| 47|   LOW|      HIGH|      14|1980-11-14T09:10:20|drugC|   2|     M| 0.056468|0.739309|
| 47|   LOW|      HIGH|      14|1980-11-14T09:10:20|drugC|   3|     M| 0.068944|0.697269|
| 28|NORMAL|      HIGH|   05,14|1990-05-14T05:12:05|drugX|   4|     F| 0.072289|

In [39]:
// Second pass: no "columns" restriction this time, same "es" locale and grouping
// settings, and field extraction switched on. This is the frame saved to disk later.
val drugDfForAnalysis =
  df.enrich.
    option("locale", "es").
    option("groupingUsed", true).
    option("extractFields", true).
    inferTypes

// Check the resulting schema and preview the rows.
drugDfForAnalysis.printSchema
drugDfForAnalysis.show

root
 |-- Age: long (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholestrol: string (nullable = true)
 |-- Cost: float (nullable = true)
 |-- DateTested: timestamp (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Ent#: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Potassium: double (nullable = true)
 |-- Sodium: double (nullable = true)



## Explore the statistics and distributions

In [53]:
// Pull the "types" section out of the profile as a table and display it.
// In the recorded output each row is (name = type, value = fraction, column).
val discovered_types = dfProfile.render("types")
discovered_types.show

+---------+-------------------+----------+
|     name|              value|    column|
+---------+-------------------+----------+
|     Long|                1.0|       Age|
|   String|                1.0|        BP|
|   String|                1.0|Cholestrol|
|    Short|0.16666666666666666|      Cost|
|   String|0.43333333333333335|      Cost|
|     Byte|                0.4|      Cost|
|Timestamp|                1.0|DateTested|
|   String|                1.0|      Drug|
|     Long|                1.0|      Ent#|
|   String|                1.0|    Gender|
|   Double|                1.0| Potassium|
|   Double|                1.0|    Sodium|
+---------+-------------------+----------+



In [60]:
// Keep only the rows describing the Cost column to see which types were discovered in it.
val discovered_types_Cost = discovered_types.select("name","value", "column").filter("column = 'Cost'")
discovered_types_Cost.show

+------+-------------------+------+
|  name|              value|column|
+------+-------------------+------+
| Short|0.16666666666666666|  Cost|
|String|0.43333333333333335|  Cost|
|  Byte|                0.4|  Cost|
+------+-------------------+------+



In [61]:
// Pull the "stats" section out of the profile and display only the Cost rows
// (count, mean, min, max, range, stddev, variance in the recorded output).
val discovered_stats = dfProfile.render("stats")
discovered_stats.filter(discovered_stats("column") === "Cost").show

+--------+------------------+------+
|    name|             value|column|
+--------+------------------+------+
|   count|              17.0|  Cost|
|    mean|162.76470588235293|  Cost|
|     min|              11.0|  Cost|
|     max|             811.0|  Cost|
|   range|             800.0|  Cost|
|  stddev|               NaN|  Cost|
|variance|               NaN|  Cost|
+--------+------------------+------+



In [62]:
// Pull the "histogram" section out of the profile and display the Cost buckets.
// In the recorded output the NaN-NaN bucket holds the 13 values mentioned in the tutorial text.
val discovered_histogram = dfProfile.render("histogram")
discovered_histogram.filter(discovered_histogram("column") === "Cost").show

+-----------+-----+------+
|       name|value|column|
+-----------+-----+------+
|    NaN-NaN|   13|  Cost|
|  11.0-51.0|    7|  Cost|
|  51.0-91.0|    2|  Cost|
| 91.0-131.0|    3|  Cost|
|131.0-171.0|    0|  Cost|
|171.0-211.0|    3|  Cost|
|211.0-251.0|    0|  Cost|
|251.0-291.0|    0|  Cost|
|291.0-331.0|    0|  Cost|
|331.0-371.0|    0|  Cost|
|371.0-411.0|    0|  Cost|
|411.0-451.0|    0|  Cost|
|451.0-491.0|    0|  Cost|
|491.0-531.0|    0|  Cost|
|531.0-571.0|    0|  Cost|
|571.0-611.0|    0|  Cost|
|611.0-651.0|    0|  Cost|
|651.0-691.0|    0|  Cost|
|691.0-731.0|    0|  Cost|
|731.0-771.0|    0|  Cost|
+-----------+-----+------+
only showing top 20 rows



In [59]:
// Pull the "labels" section out of the profile and display it; the recorded output
// lists value counts for the BP, Cholestrol, Drug and Gender columns.
val discovered_labels = dfProfile.render("labels")
discovered_labels.show

+------+-----+----------+
|  name|value|    column|
+------+-----+----------+
|   LOW|   16|        BP|
|NORMAL|    7|        BP|
|  HIGH|    7|        BP|
|  HIGH|   20|Cholestrol|
|NORMAL|   10|Cholestrol|
| drugY|   19|      Drug|
| drugC|    5|      Drug|
| drugX|    5|      Drug|
| drugA|    1|      Drug|
|     F|   16|    Gender|
|     M|   14|    Gender|
+------+-----+----------+



In [58]:
// Pull the "datetime" section out of the profile and display it; the recorded output
// shows counts per time bucket for the DateTested column.
val discovered_datetime = dfProfile.render("datetime")
discovered_datetime.show

+--------------------+-----+----------+
|                name|value|    column|
+--------------------+-----+----------+
|1980-10-11 10:10:...|   24|DateTested|
|1982-03-17 12:19:...|    0|DateTested|
|1983-08-21 14:28:...|    0|DateTested|
|1985-01-24 16:37:...|    0|DateTested|
|1986-06-30 18:46:...|    0|DateTested|
|1987-12-04 20:55:...|    0|DateTested|
|1989-05-09 23:04:...|    5|DateTested|
|1990-10-14 01:13:...|    0|DateTested|
|1992-03-19 03:23:...|    0|DateTested|
|1993-08-23 05:32:...|    0|DateTested|
|1995-01-27 07:41:...|    0|DateTested|
|1996-07-02 09:50:...|    0|DateTested|
|1997-12-06 11:59:...|    0|DateTested|
|1999-05-12 14:08:...|    0|DateTested|
|2000-10-15 16:17:...|    0|DateTested|
|2002-03-21 18:26:...|    0|DateTested|
|2003-08-25 20:35:...|    0|DateTested|
|2005-01-28 22:44:...|    0|DateTested|
|2006-07-05 00:53:...|    0|DateTested|
|2007-12-09 03:02:...|    1|DateTested|
+--------------------+-----+----------+



## Store data frame to disk and verify

In [46]:
// Persist the cleaned drug data through the "com.ibm.spark.discover" format.
// NOTE(review): the target lies inside sampleDataDir, the directory the first load reads from — confirm this does not affect a re-run of that load.
drugDfForAnalysis.write.format("com.ibm.spark.discover").save("/resources/data/sparklingdata/data/sampleDataDir/temp/drugDfForAnalysis.json")

In [47]:
// Read the saved data back to verify the write.
// In the recorded output the reloaded schema reports Cost as double and DateTested as
// string, whereas the frame that was written had float and timestamp.
val dfSavedDrugInfo2015 = sqlContext.read.format("com.ibm.spark.discover").load("/resources/data/sparklingdata/data/sampleDataDir/temp/drugDfForAnalysis.json")
dfSavedDrugInfo2015.printSchema()
dfSavedDrugInfo2015.show()

Not available
root
 |-- Age: long (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholestrol: string (nullable = true)
 |-- Cost: double (nullable = true)
 |-- DateTested: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Ent#: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Potassium: double (nullable = true)
 |-- Sodium: double (nullable = true)



# Part 2: Explore customer data and prepare for analysis
Now that we have explored our drug data, let us explore our customer data.


In [48]:
// Load the customer CSV file directly through the "com.ibm.spark.discover" data source.
// In the recorded output the columns come back as C0..C4, all typed as string.
val dfCust = sqlContext.read.format("com.ibm.spark.discover").load("/resources/data/sparklingdata/data/sampleDataDir/customers.csv")
dfCust.printSchema
dfCust.show

Not available
root
 |-- C0: string (nullable = true)
 |-- C1: string (nullable = true)
 |-- C2: string (nullable = true)
 |-- C3: string (nullable = true)
 |-- C4: string (nullable = true)



In [49]:
// Infer types for the customer data with semantic types and field extraction enabled.
// In the recorded output C1, C2 and C3 become structs (person, organization and address
// fields) and C4 becomes float.
val dfcsv_infer = dfCust.enrich.option("semanticTypes", true).option("extractFields", true).inferTypes
dfcsv_infer.printSchema
dfcsv_infer.show

root
 |-- C0: string (nullable = true)
 |-- C1: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |    |-- person: string (nullable = true)
 |-- C2: struct (nullable = true)
 |    |-- organization: string (nullable = true)
 |-- C3: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- stateorprovince: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |    |-- address: string (nullable = true)
 |-- C4: float (nullable = true)



In [50]:
// Semantic type inference on the customer data with "revealNA" in "any" mode and the
// bracket pair >[ ... ]<, mirroring the bad-data check used on the drug data.
// Presumably non-conforming values are shown wrapped in the brackets — confirm.
val dfcsv_reveal = dfCust.enrich.option("semanticTypes", true).option("revealNA", Map("mode" -> "any", "brackets" -> (">[", "]<"))).inferTypes
dfcsv_reveal.show

In [51]:
// Profile the customer data after semantic type inference.
val dfCustProfiled = dfCust.enrich.option("semanticTypes", true).inferTypes.profile
// Per-column type summary (inferred type, occurrence, discovered types and their shares
// in the recorded output).
dfCustProfiled.printTypes
// Full per-column profile.
dfCustProfiled.printProfile

C0:{"inferred_type":"String","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[String],"DiscoveredDataTypePercentages":[1.0]}
C1:{"inferred_type":"Person","inferred_occurrence":"96","threshold%":"50","DiscoveredDataTypes":[Person, String],"DiscoveredDataTypePercentages":[0.967741935483871, 0.03225806451612903]}
C2:{"inferred_type":"Organization","inferred_occurrence":"80","threshold%":"50","DiscoveredDataTypes":[Organization, String],"DiscoveredDataTypePercentages":[0.8064516129032258, 0.1935483870967742]}
C3:{"inferred_type":"Address","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[Address],"DiscoveredDataTypePercentages":[1.0]}
C4:{"inferred_type":"Float","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[Short, Float],"DiscoveredDataTypePercentages":[0.8064516129032258, 0.1935483870967742]}

C0:{"StatsNames":["count","numberOfCategories","mode"],"inferred_type":"String","inferred_occurrence":100,"Percentages":[0.0645161290322

# Summary
This tutorial showed how to discover and explore data and prepare it for further analysis. You can copy this notebook or parts of this notebook into your own notebook and adjust the code as needed.

## Want to learn more?

<a href="http://bigdatauniversity.com/courses/scala-course/?utm_source=tutorial-sparkling-scala&utm_medium=dswb&utm_campaign=bdu"><img src = "https://ibm.box.com/shared/static/qe9ofshd0nrhgcx7620cdf0a6mgn1qd2.png"> </a>

Created by: <a href="https://bigdatauniversity.com/?utm_source=bducreatedbylink&utm_medium=dswb&utm_campaign=bdu">The Cognitive Class Team</a>