In [1]:
# Run this cell first: pyspark is only imported (next cell) after findspark.init() has run.
import findspark
findspark.init()


In [2]:
# Second cell: import pyspark (this must happen after findspark.init())
# and obtain the SparkSession that every later cell refers to as `spark`.
import pyspark
from pyspark.sql import SparkSession

session_builder = SparkSession.builder
spark = session_builder.getOrCreate()


In [3]:
# Build a single-column DataFrame holding 0..499; toDF("number") names the column "number".
df = spark.range(500).toDF("number")
# show() prints only the first 20 rows.
df.show()


+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



In [4]:
# Derive a new column by adding 10 to every value of "number".
# select() returns a new DataFrame; `df` itself is left unchanged.
number_plus_ten = df["number"] + 10
df.select(number_plus_ten).show()


+-------------+
|(number + 10)|
+-------------+
|           10|
|           11|
|           12|
|           13|
|           14|
|           15|
|           16|
|           17|
|           18|
|           19|
|           20|
|           21|
|           22|
|           23|
|           24|
|           25|
|           26|
|           27|
|           28|
|           29|
+-------------+
only showing top 20 rows



In [5]:
# Keep only the rows whose number modulo 2 is not equal to 0 (the odd values).
is_odd = df["number"] % 2 != 0
df.filter(is_odd).show()

+------+
|number|
+------+
|     1|
|     3|
|     5|
|     7|
|     9|
|    11|
|    13|
|    15|
|    17|
|    19|
|    21|
|    23|
|    25|
|    27|
|    29|
|    31|
|    33|
|    35|
|    37|
|    39|
+------+
only showing top 20 rows



In [6]:
# df.rdd exposes the DataFrame as an RDD, so RDD operations such as count() can be applied.
df.rdd.count()


500

In [7]:
# Take the first 10 elements of the RDD; each element is a Row object.
df.rdd.take(10)

[Row(number=0),
 Row(number=1),
 Row(number=2),
 Row(number=3),
 Row(number=4),
 Row(number=5),
 Row(number=6),
 Row(number=7),
 Row(number=8),
 Row(number=9)]

In [8]:
# collect() returns every row of the DataFrame as a Python list of Row objects.
# spark.range() names its single column "id" when toDF() is not used to rename it.
spark.range(10).collect()


[Row(id=0),
 Row(id=1),
 Row(id=2),
 Row(id=3),
 Row(id=4),
 Row(id=5),
 Row(id=6),
 Row(id=7),
 Row(id=8),
 Row(id=9)]

In [9]:
# Create a Row by hand from positional values (the values carry no field names).
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)
# Display the class of the object that was just created.
type(myRow)

pyspark.sql.types.Row

In [10]:
# A Row supports positional indexing: index 0 is the first value.
myRow[0]

'Hello'

In [11]:
# Index 2 is the third value (indexing is zero-based).
myRow[2]

1

In [12]:
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Hand-written schema: two nullable string columns ("some", "col")
# followed by one non-nullable long column ("names").
myManualSchema = StructType(
    [
        StructField("some", StringType(), nullable=True),
        StructField("col", StringType(), nullable=True),
        StructField("names", LongType(), nullable=False),
    ]
)

In [13]:
# Rebind myRow to a three-value Row: one value per field of myManualSchema.
# The None lands in the nullable "col" column.
myRow = Row("Hello", None, 1)

In [14]:
# Build a one-row DataFrame from myRow, using the explicit schema
# instead of letting Spark infer column names and types.
myDf = spark.createDataFrame(data=[myRow], schema=myManualSchema)

In [15]:
# Print the DataFrame; the column headers come from myManualSchema.
myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|null|    1|
+-----+----+-----+



In [16]:
# Example of using Row to create a dataframe.
# Calling Row with only field names builds a reusable row template:
# cats(<values>) then yields a Row whose values carry these field names.
from pyspark.sql import Row
cats = Row("Name", "Nickname", "Location", "Treat")

In [17]:
cat1 = Row('Dakota', 'Sweetie', "house", "salmon")
cat2 = Row('George', 'Grumpy', "apt", "liver")
cat3 = Row('Karrot', 'BiggieK', "condo", "chicken")
cat4 = Row('Tigress', 'Claw', "street", "trout")
cat5 = Row('Kitty', 'Meow', "house", "salmon")

In [18]:
print(cat3)

<Row(Karrot, BiggieK, condo, chicken)>


In [19]:
# Shelter rows are built with keyword arguments, so each value is
# stored under its field name (id, name).
shelter1 = Row(id="23456", name="CatColony")
shelter2 = Row(id="11111", name="Mauhaus")
shelter3 = Row(id="98765", name="BigCatHouse")
shelter4 = Row(id="56789", name="WindowCats")

In [20]:
# A Row built from keyword arguments prints with its field names.
print(shelter2)

Row(id='11111', name='Mauhaus')


In [21]:
# Nest rows: each record pairs one shelter Row with a list of cat Rows.
# The same cat Row may appear under more than one shelter.
shelterWithCats1 = Row(shelter=shelter1, cats=[cat1, cat2])
shelterWithCats2 = Row(shelter=shelter2, cats=[cat3, cat4])
shelterWithCats3 = Row(shelter=shelter3, cats=[cat5, cat4, cat1])
shelterWithCats4 = Row(shelter=shelter4, cats=[cat2, cat3])

In [22]:
# Gather the nested rows into one list: the input data for the DataFrame.
shelterWithCats = [
    shelterWithCats1,
    shelterWithCats2,
    shelterWithCats3,
    shelterWithCats4,
]

In [23]:
# Build the DataFrame from the list of nested Rows. No schema is passed,
# so the columns are derived from the Row fields themselves.
dframe = spark.createDataFrame(data=shelterWithCats)

In [24]:
# Show the dataframe; long cell values are cut off with "..." in the printed table.
dframe.show()

+--------------------+--------------------+
|                cats|             shelter|
+--------------------+--------------------+
|[[Dakota, Sweetie...|  [23456, CatColony]|
|[[Karrot, BiggieK...|    [11111, Mauhaus]|
|[[Kitty, Meow, ho...|[98765, BigCatHouse]|
|[[George, Grumpy,...| [56789, WindowCats]|
+--------------------+--------------------+



In [25]:
# Stop the SparkSession (and with it the underlying Spark context) now that the notebook is done.
spark.stop()