In [0]:
# The SparkContext/SparkSession is the entry point for all Spark operations
# sc = the SparkContext = the execution environment of Spark, only 1 per JVM
# Note that SparkSession is now the entry point (from Spark v2.0)
# This tutorial uses SparkContext (was used prior to Spark 2.0)

from pyspark import SparkContext
# sc = SparkContext(appName = "some_application_name") # You'd normally run this, but in this case, it has already been created in the Databricks' environment

In [0]:
# Sample text (Hamlet's soliloquy) used as input for the word-level RDD examples below.
quote = "To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer The slings and arrows of outrageous fortune, Or to take Arms against a Sea of troubles, And by opposing end them: to die, to sleep No more; and by a sleep, to say we end the heart-ache, and the thousand natural shocks that Flesh is heir to? 'Tis a consummation devoutly to be wished. To die, to sleep, To sleep, perchance to Dream; aye, there's the rub, for in that sleep of death, what dreams may come, when we have shuffled off this mortal coil, must give us pause."

https://spark.apache.org/docs/2.1.1/programming-guide.html#parallelized-collections

In [0]:
# Break the quote on single spaces and hand the resulting list of tokens to
# Spark, which turns it into an RDD.
sparkdata = sc.parallelize(
    quote.split(" ")
)

In [0]:
# Display the RDD object itself as the cell output (its repr, not its elements).
sparkdata

In [0]:
# Printing the RDD object shows only its description, not the data it holds.
print("sparkdata = ", sparkdata)
# collect() returns the elements as a Python list; the slice keeps items 1..9.
print("sparkdata.collect() = ", sparkdata.collect()[1:10])

In [0]:
# A simple transformation - map
def mapword(word):
  """Pair ``word`` with an initial count of 1, giving a ``(word, 1)`` tuple."""
  initial_count = 1
  return word, initial_count

In [0]:
# map() alone is a lazy transformation: nothing has been computed here, so
# this only prints the description of the new RDD.
print(sparkdata.map(mapword))
# collect() is an action, so this is where the DAG actually executes.
print(sparkdata.map(mapword).collect()[1:10])

In [0]:
# Word count: pair every word with 1, add up the 1s per distinct word, then
# bring the resulting (word, count) pairs back as a Python list.
wordCountsCollected = sparkdata.map(mapword).reduceByKey(
    lambda running_total, count: running_total + count
).collect()

In [0]:
# Show the collected (word, count) pairs.
print(wordCountsCollected)

In [0]:
ДЗ1:
  Написать функцию для Map, которая вернёт все слова длиннее 2 символов.

In [0]:
def charsmorethan2(tuple1):
  """Return ``tuple1`` if its first element is longer than 2, else ``None``.

  Only ``tuple1[0]`` is inspected; the rest of the tuple is passed through
  untouched. The ``None`` result is falsy, which is what lets the function
  double as a filter predicate.
  """
  first_item = tuple1[0]
  return tuple1 if len(first_item) > 2 else None

In [0]:
# Two lazy transformations chained in one statement: pair each word with 1,
# then keep only the pairs accepted by charsmorethan2. Nothing runs yet.
rdd3 = (
    sparkdata
    .map(mapword)
    .filter(charsmorethan2)
)
# collect() is the action that makes the DAG execute. Punctuation marks were
# never removed, so tokens such as 'be,' are included as well.
rdd3.collect()[1:10]

In [0]:
# A general table-like example: every inner list is one row. Judging by the
# values, the fields are an id, a doctor name, a payment and a hospital.
cms = sc.parallelize([
    [1, "Dr. A", 12.50, "Yale"],
    [2, "Dr. B", 5.10, "Duke"],
    [3, "Dr. C", 200.34, "Mt. Sinai"],
    [4, "Dr. D", 5.67, "Duke"],
    [1, "Dr. E", 52.50, "Yale"],
])

In [0]:
def findPayment(data):
  """Return the payment field of a row, i.e. the element at index 2."""
  payment = data[2]
  return payment

In [0]:
# Extract the payment of every row and bring the values back as a list.
print("Payments = ", cms.map(findPayment).collect())
# mean() is an action: it runs the job and returns a plain number.
print("Mean = ", cms.map(findPayment).mean())

ДЗ2:
  В таблице выше найдите стандартное отклонение, минимальное и максимальное значения, а также сумму всех выплат.

In [0]:
# Build the (lazy) RDD of payments once; each action below still runs its
# own job over it.
payments = cms.map(findPayment)
print("Payments = ", payments.collect())
print("Standard Deviation = ", payments.stdev())
print("Minimal Value = ", payments.min())
print("Maximum Value = ", payments.max())
print("Total Sum = ", payments.sum())

In [0]:
import pyspark.sql.functions as func

# Creating a DataFrame (familiar to Python programmers) from the cms RDD,
# naming the four fields of every row.
cms_df = sqlContext.createDataFrame(cms, ["ID", "Name", "Payment", "Hosp"])
# show() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
cms_df.show()

# Average, maximum and minimum payment per hospital. Built once and reused
# below instead of repeating the same groupby/agg chain for every output.
hosp_stats = cms_df.groupby('Hosp').agg(
    func.avg('Payment'),
    func.max('Payment'),
    func.min('Payment'),
)
# Printing the DataFrame object shows only its description, not its rows.
print(hosp_stats)
# collect() runs the aggregation and returns the rows to the driver.
print(hosp_stats.collect())
print()
print("Converting to a Pandas DataFrame")
print("--------------------------------")
pd_df = hosp_stats.toPandas()
print(type(pd_df))
print()
print(pd_df)


In [0]:
wordsList = ['to', 'be', 'or', 'not', 'to', 'be']
# Distribute the list as an RDD split into 3 partitions (the second argument
# of parallelize is the number of slices).
wordsRDD = sc.parallelize(wordsList, numSlices=3)
# Print out the type of wordsRDD
print(type(wordsRDD))

In [0]:
# Fetch every element of wordsRDD as a Python list and display it as the cell output.
wordsRDD.collect()

In [0]:
# An example with changing the case of words
# One way of completing the function
def makeUpperCase(word):
  """Return ``word`` converted to upper case via its ``upper()`` method."""
  upper_cased = word.upper()
  return upper_cased

# Quick local check of the helper on a single word before using it with Spark.
print(makeUpperCase("cat"))

In [0]:
# Apply the named helper to every element of wordsRDD (lazy), then collect
# the upper-cased words to print them.
upperRDD = wordsRDD.map(makeUpperCase)
print(upperRDD.collect())


In [0]:
# Same transformation as with makeUpperCase, written inline as a lambda.
upperLambdaRDD = wordsRDD.map(
    lambda token: token.upper()
)
print(upperLambdaRDD.collect())