#### Count the number of people at each age in the fake friends dataset

In [3]:
# findspark.init() is deliberately called before the pyspark imports below;
# presumably it makes the local Spark installation importable -- confirm
# against the findspark documentation.
import findspark
findspark.init()
# NOTE(review): SparkConf and SparkContext are not used in the visible cells.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [4]:
# Build the SparkSession for this notebook, or reuse one that already exists,
# under the application name "SparkSQL".
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [6]:
# Load the raw CSV with the reader's default options: no header row and no
# schema are supplied, so the columns come back as _c0.._c3, all typed string.
schemaPeople = spark.read.format("csv").load("fakefriends.csv")

In [9]:
# Preview the first 20 rows as a text table.
schemaPeople.show(n=20)

+---+--------+---+---+
|_c0|     _c1|_c2|_c3|
+---+--------+---+---+
|  0|    Will| 33|385|
|  1|Jean-Luc| 26|  2|
|  2|    Hugh| 55|221|
|  3|  Deanna| 40|465|
|  4|   Quark| 68| 21|
|  5|  Weyoun| 59|318|
|  6|  Gowron| 37|220|
|  7|    Will| 54|307|
|  8|  Jadzia| 38|380|
|  9|    Hugh| 27|181|
| 10|     Odo| 53|191|
| 11|     Ben| 57|372|
| 12|   Keiko| 54|253|
| 13|Jean-Luc| 56|444|
| 14|    Hugh| 43| 49|
| 15|     Rom| 36| 49|
| 16|  Weyoun| 22|323|
| 17|     Odo| 35| 13|
| 18|Jean-Luc| 45|455|
| 19|  Geordi| 60|246|
+---+--------+---+---+
only showing top 20 rows



In [12]:
# Fetch the first five rows to the driver as a list of Row objects.
schemaPeople.head(n=5)

[Row(_c0='0', _c1='Will', _c2='33', _c3='385'),
 Row(_c0='1', _c1='Jean-Luc', _c2='26', _c3='2'),
 Row(_c0='2', _c1='Hugh', _c2='55', _c3='221'),
 Row(_c0='3', _c1='Deanna', _c2='40', _c3='465'),
 Row(_c0='4', _c1='Quark', _c2='68', _c3='21')]

In [14]:
# Print each column's name, type and nullability
# (roughly analogous to pandas' DataFrame.info()).
schemaPeople.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [16]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
schemaPeople.createOrReplaceTempView("people")

# _c2 is the age column. It was loaded as a string, so cast it explicitly
# instead of relying on Spark's implicit string-to-number coercion.
teenagers = spark.sql(
    "SELECT * FROM people WHERE CAST(_c2 AS INT) BETWEEN 13 AND 19"
)

# spark.sql() returns a DataFrame (not an RDD). collect() brings every
# matching row back to the driver as a list of Row objects.
for teen in teenagers.collect():
    print(teen)

Row(_c0='21', _c1='Miles', _c2='19', _c3='268')
Row(_c0='52', _c1='Beverly', _c2='19', _c3='269')
Row(_c0='54', _c1='Brunt', _c2='19', _c3='5')
Row(_c0='106', _c1='Beverly', _c2='18', _c3='499')
Row(_c0='115', _c1='Dukat', _c2='18', _c3='397')
Row(_c0='133', _c1='Quark', _c2='19', _c3='265')
Row(_c0='136', _c1='Will', _c2='19', _c3='335')
Row(_c0='225', _c1='Elim', _c2='19', _c3='106')
Row(_c0='304', _c1='Will', _c2='19', _c3='404')
Row(_c0='341', _c1='Data', _c2='18', _c3='326')
Row(_c0='366', _c1='Keiko', _c2='19', _c3='119')
Row(_c0='373', _c1='Quark', _c2='19', _c3='272')
Row(_c0='377', _c1='Beverly', _c2='18', _c3='418')
Row(_c0='404', _c1='Kasidy', _c2='18', _c3='24')
Row(_c0='409', _c1='Nog', _c2='19', _c3='267')
Row(_c0='439', _c1='Data', _c2='18', _c3='417')
Row(_c0='444', _c1='Keiko', _c2='18', _c3='472')
Row(_c0='492', _c1='Dukat', _c2='19', _c3='36')
Row(_c0='494', _c1='Kasidy', _c2='18', _c3='194')


In [19]:
# Count how many rows share each age (_c2). The column was read as a string,
# so sort on an integer cast: ordering the raw strings is lexicographic and
# would misplace any age that is not exactly two digits (e.g. "100" < "18").
age_counts = schemaPeople.groupBy("_c2").count()
age_counts.orderBy(age_counts["_c2"].cast("int")).show()

+---+-----+
|_c2|count|
+---+-----+
| 18|    8|
| 19|   11|
| 20|    5|
| 21|    8|
| 22|    7|
| 23|   10|
| 24|    5|
| 25|   11|
| 26|   17|
| 27|    8|
| 28|   10|
| 29|   12|
| 30|   11|
| 31|    8|
| 32|   11|
| 33|   12|
| 34|    6|
| 35|    8|
| 36|   10|
| 37|    9|
+---+-----+
only showing top 20 rows



In [20]:
# Stop the Spark session now that the analysis is done.
spark.stop()