
# Transformations, Actions, and Visualizations

In [0]:
dbutils.help()

In [0]:
dbutils.fs.ls("dbfs:/")

# Out[2]: [FileInfo(path='dbfs:/ /', name=' /', size=0),
#  FileInfo(path='dbfs:/%(s3_ddl_location)s/', name='%(s3_ddl_location)s/', size=0),

In [0]:
dbutils.fs.ls("/databricks-datasets")

# Out[3]: [FileInfo(path='dbfs:/databricks-datasets/COVID/', name='COVID/', size=0),
#  FileInfo(path='dbfs:/databricks-datasets/README.md', name='README.md', size=976),
#  FileInfo(path='dbfs:/databricks-datasets/Rdatasets/', name='Rdatasets/', size=0),
#  FileInfo(path='dbfs:/databricks-datasets/SPARK_README.md', name='SPARK_README.md', size=3359),
#  FileInfo(path='dbfs:/databricks-datasets/adult/', name='adult/', size=0),

In [0]:
dbutils.fs.ls("/databricks-datasets/adult")

In [0]:
%fs

ls /databricks-datasets/adult

path,name,size
dbfs:/databricks-datasets/adult/README.md,README.md,2672
dbfs:/databricks-datasets/adult/adult.data,adult.data,3974305
dbfs:/databricks-datasets/adult/adult.test,adult.test,2003132


In [0]:
# Load the raw adult census file from DBFS into a Spark DataFrame.
# NOTE(review): header=True makes Spark use the file's first line as column
# names. Later cells address columns as ' State-gov', ' Adm-clerical' and
# ' <=50K', which look like data values rather than field names, so the first
# record is probably being consumed as the header -- confirm against the file.
adult_census_data = spark.read.csv("dbfs:/databricks-datasets/adult/adult.data", header = True)

# Show the Python type of the object returned by the reader.
type(adult_census_data)

In [0]:
adult_census_rdd = adult_census_data.rdd
type(adult_census_rdd)

In [0]:
adult_census_rdd.collect()

In [0]:
adult_census_rdd.count()

In [0]:
adult_census_rdd.first()

In [0]:
# map() is a transformation: this cell only defines a new RDD holding fields 1, 3
# and 5 of each row; nothing is computed until an action (e.g. collect()) runs.
adult_census_rdd.map(lambda row : (row[1], row[3], row[5]))

In [0]:
adult_census_rdd.map(lambda row : (row[1], row[3], row[5])).collect()

# Out[11]: [(' Self-emp-not-inc', ' Bachelors', ' Married-civ-spouse'),
#  (' Private', ' HS-grad', ' Divorced'),
#  (' Private', ' 11th', ' Married-civ-spouse'),
#  (' Private', ' Bachelors', ' Married-civ-spouse'),
#  (' Private', ' Masters', ' Married-civ-spouse'),

In [0]:
# Project three fields of every row by column name, then collect the result.
selected_by_name = adult_census_rdd.map(
    lambda record: (record[' State-gov'], record[' Adm-clerical'], record[' <=50K'])
)
selected_by_name.collect()

# Out[12]: [(' Self-emp-not-inc', ' Exec-managerial', ' <=50K'),
#  (' Private', ' Handlers-cleaners', ' <=50K'),
#  (' Private', ' Handlers-cleaners', ' <=50K'),
#  (' Private', ' Prof-specialty', ' <=50K'),
#  (' Private', ' Exec-managerial', ' <=50K'),
#  (' Private', ' Other-service', ' <=50K'),
#  (' Self-emp-not-inc', ' Exec-managerial', ' >50K'),

In [0]:
def _income_label_matches(record):
    """Return True when the record's ' <=50K' field holds the value ' <=50K'."""
    return record[' <=50K'] == ' <=50K'


# Keep only the matching rows.
adult_census_rdd_filtered = adult_census_rdd.filter(_income_label_matches)

In [0]:
adult_census_rdd_filtered.count()


### Transformations and actions on dataframes

In [0]:
dbutils.fs.ls("/databricks-datasets/bikeSharing/")

In [0]:
%fs

ls /databricks-datasets/bikeSharing/data-001/

path,name,size
dbfs:/databricks-datasets/bikeSharing/data-001/day.csv,day.csv,57569
dbfs:/databricks-datasets/bikeSharing/data-001/hour.csv,hour.csv,1156736


In [0]:
# Read the daily bike-sharing CSV: the first line is the header, column types
# are inferred, and fields are comma-separated.
bike_sharing_reader = spark.read.format('csv').options(inferschema=True, header=True, sep=",")
bike_sharing_data = bike_sharing_reader.load("/databricks-datasets/bikeSharing/data-001/day.csv")

In [0]:
bike_sharing_data.show(10)

In [0]:
bike_sharing_data_selected = bike_sharing_data.select('season', 'holiday', 'cnt')

In [0]:
bike_sharing_data_selected.show()

In [0]:
# Show the rows whose `cnt` value exceeds 1000.
high_cnt_rows = bike_sharing_data.where(bike_sharing_data['cnt'] > 1000)
high_cnt_rows.show()


### Uploading a Dataset to DBFS using notebooks
###### Settings -> Admin console -> Workspace Settings -> Advanced Section (Enable - DBFS File Browser) -> Refresh page -> Data Left Panel -> Two Section (Database Tables | DBFS)

In [0]:
# Upload the data through the UI: File -> Upload Data -> /FileStore/
# OR

df1 = spark.read.format("csv").load("dbfs:/FileStore/shared_uploads/rshukla@atlassian.com/credit_train.csv")  # no "header" option is set here, so column names are not taken from the file's first line

In [0]:
# Read the uploaded credit CSV with a header row, inferred column types and an
# explicit comma separator.
credit_csv_path = "dbfs:/FileStore/shared_uploads/rshukla@atlassian.com/credit_train.csv"
credit_data = (
    spark.read.format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .option("sep", ",")
    .load(credit_csv_path)
)


### Basic Selection and Filtering Operations

In [0]:
credit_data.count()

In [0]:
# Keep the 10-row, 3-column subset as a DataFrame. `.show()` only prints and
# returns None, so it must not be part of the assignment (the original bound
# credit_data_subset to None).
credit_data_subset = credit_data.select('Loan Status', 'Current Loan Amount', 'Annual Income').limit(10)
credit_data_subset.show()

In [0]:
credit_data.filter(credit_data['Annual Income'].isNull()).show()

In [0]:
credit_data = credit_data.dropna()

In [0]:
# Customers with at least one recorded bankruptcy.
has_bankruptcy = credit_data.Bankruptcies > 0
(
    credit_data
    .select('Customer ID', 'Annual Income', 'Home Ownership', 'Bankruptcies')
    .where(has_bankruptcy)
    .show()
)

In [0]:
# Both conditions must hold: income above 100000 and at least one tax lien.
high_income = credit_data['Annual Income'] > 100000
has_tax_lien = credit_data['Tax Liens'] > 0
(
    credit_data
    .select('Customer ID', 'Annual Income', 'Home Ownership', 'Bankruptcies')
    .filter(high_income & has_tax_lien)
    .show()
)

In [0]:
# Keep rows whose 'Home Ownership' value is one of the listed categories.
ownership_categories = ['Home Mortgage', 'Rent']
ownership_matches = credit_data['Home Ownership'].isin(ownership_categories)
credit_data.select('Customer ID', 'Home Ownership').filter(ownership_matches).show()


### Writing CSV Files out to DBFS

In [0]:
# Select three columns and show them under friendlier names.
renamed_credit_columns = (
    credit_data
    .select('Customer ID', 'Annual Income', 'Monthly Debt')
    .withColumnRenamed('Annual Income', 'Income')
    .withColumnRenamed('Monthly Debt', 'Monthly Debt Payment')
)
renamed_credit_columns.show()

In [0]:
# Derived column: annual income minus twelve times the monthly debt.
savings_expr = credit_data['Annual Income'] - 12 * credit_data['Monthly Debt']
(
    credit_data
    .select('Customer ID', 'Annual Income', 'Monthly Debt')
    .withColumn('Savings', savings_expr)
    .show()
)

In [0]:
# Largest monthly debt first.
debt_descending = credit_data['Monthly Debt'].desc()
(
    credit_data
    .select('Customer ID', 'Annual Income', 'Monthly Debt')
    .orderBy(debt_descending)
    .show()
)

In [0]:
# Sort ascending by years of credit history. `.asc()` has to be called on the
# Column inside orderBy(); the original closed the orderBy() parenthesis too
# early, which made the cell a syntax error.
credit_data.select('Customer ID', 'Years of Credit History', 'Years in current job')\
           .orderBy(credit_data['Years of Credit History'].asc())\
           .show()

In [0]:
credit_data.groupBy('Loan Status').count().show()

In [0]:
credit_data.groupBy('Purpose').agg({'Current Loan Amount': 'sum'}).show()

In [0]:
# Write the per-purpose row counts to DBFS as CSV. The default save mode raises
# if the target path already exists, so re-running the notebook would fail here;
# 'overwrite' makes the cell safe to run repeatedly.
credit_data.groupBy('Purpose').count() \
           .write.mode('overwrite') \
           .csv('dbfs:/FileStore/shared_uploads/rshukla@atlassian.com/count_by_loan_purpose.csv')

In [0]:
# Spark generates the part-file names at write time, so look them up in the
# output directory instead of hard-coding one (the original path contained a
# masked, non-existent file name).
count_by_purpose_dir = 'dbfs:/FileStore/shared_uploads/rshukla@atlassian.com/count_by_loan_purpose.csv'
part_file_paths = [
    entry.path
    for entry in dbutils.fs.ls(count_by_purpose_dir)
    if entry.name.startswith('part-')
]
if not part_file_paths:
    raise FileNotFoundError('No part files found under ' + count_by_purpose_dir)

# Preview the beginning of the first part file.
print(dbutils.fs.head(part_file_paths[0]))


### Creating a table Using the Databricks UI


### Visualizing data using the display() function

In [0]:
auto_data = spark.sql('SELECT * FROM default.automobile_data')

In [0]:
display(auto_data)


# Modify Data Using Spark Functions


### Reading and Parsing JSON data

In [0]:
# Read people.json with multiline parsing disabled.
people_data = spark.read.json(
    's3://atl-mgmt-de-dev/rshukla/GSwASonD/people.json',
    multiLine=False,
)

In [0]:
display(people_data.filter(people_data.age >= 30))

age,city,name
30,NewYork,John
34,Chicago,James
32,Houston,Anna


In [0]:
# Read iris.json with multiline disabled. The same file is read again two cells
# below with multiline enabled, rebinding iris_data.
# NOTE(review): presumably this first read demonstrates what happens when a
# multi-line JSON file is parsed line by line -- confirm.
iris_data = spark.read.option('multiline', False) \
              .json('s3://atl-mgmt-de-dev/rshukla/GSwASonD/iris.json')

In [0]:
display(iris_data)

In [0]:
iris_data = spark.read.option('multiline', True) \
              .json('s3://atl-mgmt-de-dev/rshukla/GSwASonD/iris.json')

In [0]:
display(iris_data)

In [0]:
display(iris_data.select("species").distinct())

species
virginica
versicolor
setosa


### Accessing Nested Fields and List Elements

In [0]:
# PERMISSIVE mode lets the reader cope with corrupt records while parsing: when it
# meets a malformed record it sets the other fields to null and stores the raw
# malformed string in a separate field.
employee_data = spark.read.option('multiline', True)\
                .option("mode", "PERMISSIVE")\
                .json('s3://atl-mgmt-de-dev/rshukla/GSwASonD/employees.json')

In [0]:
display(employee_data)

address,contact,gender,id,name,salary
"List(Baltimore, MD)","List(List(watson@commerce.gov, 650-333-3456), List(emily@gmail.com, 238-111-7689))",Female,1,Emily Watson,54000.0
"List(Barton, TN)","List(List(johnsmith@yahoo.com, 425-231-8754))",Male,2,John Smith,67000.0
"List(Salt Lake City, UT)","List(List(peter@radio.us, null), List(peterjones@yahoo.com, 425-213-0987))",Male,3,Peter Jones,45000.0
"List(Seattle, WA)","List(List(nina@hotmail.com, 813-190-3628), List(ninajames@hotmail.com, 813-456-6509))",Female,4,Nina James,95500.0


In [0]:
display(employee_data.select('name', 'salary', 'address', 'contact'))

name,salary,address,contact
Emily Watson,54000.0,"List(Baltimore, MD)","List(List(watson@commerce.gov, 650-333-3456), List(emily@gmail.com, 238-111-7689))"
John Smith,67000.0,"List(Barton, TN)","List(List(johnsmith@yahoo.com, 425-231-8754))"
Peter Jones,45000.0,"List(Salt Lake City, UT)","List(List(peter@radio.us, null), List(peterjones@yahoo.com, 425-213-0987))"
Nina James,95500.0,"List(Seattle, WA)","List(List(nina@hotmail.com, 813-190-3628), List(ninajames@hotmail.com, 813-456-6509))"


In [0]:
display(employee_data.select('name', 'salary', 'address.city', 'address.state'))

name,salary,city,state
Emily Watson,54000.0,Baltimore,MD
John Smith,67000.0,Barton,TN
Peter Jones,45000.0,Salt Lake City,UT
Nina James,95500.0,Seattle,WA


In [0]:
display(employee_data.select('name', 'salary', 'contact.email', 'contact.phone'))

name,salary,email,phone
Emily Watson,54000.0,"List(watson@commerce.gov, emily@gmail.com)","List(650-333-3456, 238-111-7689)"
John Smith,67000.0,List(johnsmith@yahoo.com),List(425-231-8754)
Peter Jones,45000.0,"List(peter@radio.us, peterjones@yahoo.com)","List(null, 425-213-0987)"
Nina James,95500.0,"List(nina@hotmail.com, ninajames@hotmail.com)","List(813-190-3628, 813-456-6509)"


In [0]:
from pyspark.sql import functions as F

In [0]:
display(employee_data.select(F.col('contact.email').getItem(0).alias('email_address')))

email_address
watson@commerce.gov
johnsmith@yahoo.com
peter@radio.us
nina@hotmail.com


In [0]:
# First e-mail address and first phone number from each employee's contact list.
first_email = F.col('contact.email').getItem(0).alias('email_address')
first_phone = F.col('contact.phone').getItem(0).alias('phone_number')
display(employee_data.select('name', first_email, first_phone))

name,email_address,phone_number
Emily Watson,watson@commerce.gov,650-333-3456
John Smith,johnsmith@yahoo.com,425-231-8754
Peter Jones,peter@radio.us,
Nina James,nina@hotmail.com,813-190-3628



### Reading from Azure Data Storage


- https://databricks-instance#secrets/createScope
- vault URI
- resource ID

In [0]:
# Register the storage-account access key in the Spark configuration. The key is
# fetched from a Databricks secret scope rather than being written in the notebook.
spark.conf.set("fs.azure.account.key.loonydatastorage.blob.core.windows.net", 
              dbutils.secrets.get(scope = "loonydatabrickssecretscope", key = "loonydatasecretkey"))

In [0]:
# Read the CSV straight from the blob container via its wasbs:// URL.
# NOTE(review): this path ends in netflix_data.csv while the other cells read
# netflix_list.csv, and the next cell immediately rebinds netflix_data from S3
# -- confirm which source and file name are intended.
netflix_data = spark.read.csv("wasbs://loonydatacontainer@loonydatastorage.blob.core.windows.net/datasets/netflix_data.csv", header = "true")

In [0]:
netflix_data = spark.read.csv("s3://atl-mgmt-de-dev/rshukla/GSwASonD/netflix_list.csv", header = "true")

In [0]:
display(netflix_data)

In [0]:
# Accessing this data through the long storage-account URL is clunky. Mount the
# storage container instead so it can be reached from within the Databricks File System.
MOUNT_POINT = "/mnt/"

# dbutils.fs.mount() fails when the mount point is already in use, which breaks a
# re-run of the notebook, so only mount when it is not present yet.
already_mounted = any(
    mount.mountPoint.rstrip("/") == MOUNT_POINT.rstrip("/")
    for mount in dbutils.fs.mounts()
)

if not already_mounted:
    dbutils.fs.mount(
      source = "wasbs://loonydatacontainer@loonydatastorage.blob.core.windows.net",
      mount_point = MOUNT_POINT,
      extra_configs = {"fs.azure.account.key.loonydatastorage.blob.core.windows.net":
                      dbutils.secrets.get(scope = "loonydatabrickssecretscope", key = "loonydatasecretkey")
      }
    )

In [0]:
dbutils.fs.ls('/mnt/datasets/netflix_list.csv')

In [0]:
# Read the Netflix CSV through the mount point, then render it.
mounted_netflix_reader = (
    spark.read.format("csv")
    .option("inferSchema", "True")
    .option("header", True)
    .option("sep", ",")
)
netflix_data = mounted_netflix_reader.load("/mnt/datasets/netflix_list.csv")

display(netflix_data)


### Built-in Functions

In [0]:
# Load the Netflix CSV from S3 with a header row, inferred column types and ","
# as the field separator.
netflix_data = spark.read.csv(
    "s3://atl-mgmt-de-dev/rshukla/GSwASonD/netflix_list.csv",
    inferSchema=True,
    header=True,
    sep=",",
)

In [0]:
from pyspark.sql.functions import initcap, upper, lower

# Original title next to its upper-cased and word-capitalised forms, plus the
# lower-cased genres.
case_demo_columns = ['title', upper('title'), initcap('title'), lower('genres')]
display(netflix_data.select(*case_demo_columns))

title,upper(title),initcap(title),lower(genres)
Army of the Dead,ARMY OF THE DEAD,Army Of The Dead,"action,crime,horror"
The Woman in the Window,THE WOMAN IN THE WINDOW,The Woman In The Window,"crime,drama,mystery"
The Mitchells vs the Machines,THE MITCHELLS VS THE MACHINES,The Mitchells Vs The Machines,"adventure,animation,comedy"
Blue Miracle,BLUE MIRACLE,Blue Miracle,"adventure,biography,drama"
Things Heard & Seen,THINGS HEARD & SEEN,Things Heard & Seen,"drama,horror,mystery"
Stowaway,STOWAWAY,Stowaway,"drama,sci-fi,thriller"
Don't Look Up,DON'T LOOK UP,Don't Look Up,comedy
I Am All Girls,I AM ALL GIRLS,I Am All Girls,"crime,drama,mystery"
Hotel Transylvania: Transformania,HOTEL TRANSYLVANIA: TRANSFORMANIA,Hotel Transylvania: Transformania,"adventure,animation,comedy"
The Dig,THE DIG,The Dig,"biography,drama,history"


In [0]:
from pyspark.sql.functions import regexp_replace

# Replace every match of the regex 'En*' in `language` with 'en'.
# NOTE(review): 'En*' means "E followed by zero or more n", so a lone capital
# 'E' also matches; if only the literal text "En" should be replaced, the
# pattern 'En' is probably what was meant -- confirm.
display(netflix_data.select('title', 'origin_country', regexp_replace('language', 'En*', 'en')))

title,origin_country,"regexp_replace(language, En*, en, 1)"
Army of the Dead,United States,english
The Woman in the Window,United States,english
The Mitchells vs the Machines,United States,english
Blue Miracle,United States,english
Things Heard & Seen,United States,english
Stowaway,Germany,english
Don't Look Up,United States,english
I Am All Girls,South Africa,english
Hotel Transylvania: Transformania,United States,english
The Dig,United Kingdom,english


In [0]:
from pyspark.sql.functions import translate

# translate() substitutes character by character: 'i' -> '1' and 'o' -> '0'.
# The rendered output shows it is case-sensitive (a capital 'I' is left as is).
display(netflix_data.select('title', translate('title', 'io', '10')))

title,"translate(title, io, 10)"
Army of the Dead,Army 0f the Dead
The Woman in the Window,The W0man 1n the W1nd0w
The Mitchells vs the Machines,The M1tchells vs the Mach1nes
Blue Miracle,Blue M1racle
Things Heard & Seen,Th1ngs Heard & Seen
Stowaway,St0waway
Don't Look Up,D0n't L00k Up
I Am All Girls,I Am All G1rls
Hotel Transylvania: Transformania,H0tel Transylvan1a: Transf0rman1a
The Dig,The D1g


In [0]:
netflix_data.stat.corr('rating', 'numVotes')

In [0]:
netflix_data.stat.cov('rating', 'numVotes')