## Activity 9: Getting started with Spark DataFrames

### Create a sample dataframe by manually specifying the schema

In [1]:
# Importing pyspark to use Spark from Jupyter (os is imported alongside it)

import pyspark
import os

In [2]:
# Creating spark context & sqlcontext
from pyspark.sql import SQLContext

# getOrCreate() reuses an already-running SparkContext, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
sqlc = SQLContext(sc)

In [3]:
# Import only the name this cell needs instead of a wildcard import,
# so it is clear where Row comes from and the namespace stays clean.
from pyspark.sql import Row

# Row("Name", ...) creates a row template with these field names; calling the
# template with values produces one record per call.
na_schema = Row("Name", "Subject", "Marks")
row1 = na_schema("Ankit", "Science", 95)
row2 = na_schema("Ankit", "Maths", 86)
row3 = na_schema("Preity", "Maths", 92)
na_list = [row1, row2, row3]

# Build the DataFrame from the list of rows; column names come from the Row fields.
df_na = sqlc.createDataFrame(na_list)
type(df_na)

pyspark.sql.dataframe.DataFrame

In [4]:
# Print the contents of df_na as a text table
df_na.show()

+------+-------+-----+
|  Name|Subject|Marks|
+------+-------+-----+
| Ankit|Science|   95|
| Ankit|  Maths|   86|
|Preity|  Maths|   92|
+------+-------+-----+



### Create a sample dataframe from an existing RDD

In [5]:
## Creating a RDD
# Each tuple is one record: (name, subject, marks)
data = [
    ("Ankit", "Science", 95),
    ("Preity", "Maths", 86),
    ("Ankit", "Maths", 86),
]
# Distribute the local list through the SparkContext to obtain an RDD
data_rdd = sc.parallelize(data)
type(data_rdd)

pyspark.rdd.RDD

In [6]:
## Converting RDD to dataframe
# No column names are supplied, so the resulting columns are shown as _1, _2, _3
data_df = sqlc.createDataFrame(data_rdd, schema=None)
data_df.show()

+------+-------+---+
|    _1|     _2| _3|
+------+-------+---+
| Ankit|Science| 95|
|Preity|  Maths| 86|
| Ankit|  Maths| 86|
+------+-------+---+



### Create a sample dataframe by reading the data from a csv file

In [7]:
# Read mtcars.csv with the built-in CSV reader: the first line is treated as
# the header and column types are inferred from the data.
df = sqlc.read.csv('mtcars.csv', header=True, inferSchema=True)
type(df)

pyspark.sql.dataframe.DataFrame

### Print first 7 rows of the dataframe

In [8]:
# Display only the first 7 rows of the dataframe
df.show(7)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
|          Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
|       Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 7 rows



### Print the schema of the dataframe 

In [9]:
# Print each column's name, data type and nullability as a tree
df.printSchema()

root
 |-- model: string (nullable = true)
 |-- mpg: double (nullable = true)
 |-- cyl: integer (nullable = true)
 |-- disp: double (nullable = true)
 |-- hp: integer (nullable = true)
 |-- drat: double (nullable = true)
 |-- wt: double (nullable = true)
 |-- qsec: double (nullable = true)
 |-- vs: integer (nullable = true)
 |-- am: integer (nullable = true)
 |-- gear: integer (nullable = true)
 |-- carb: integer (nullable = true)



### Print the number of columns and rows in the dataframe

In [10]:
# Compute both sizes first, then report them
n_rows = df.count()
n_cols = len(df.columns)
print('number of rows:{}'.format(n_rows))
print('number of columns:{}'.format(n_cols))

number of rows:32
number of columns:12


### Print the summary statistics of the dataframe and of any 2 individual columns

In [11]:
## Summary statistics of the dataframe
# describe() is computed for the whole dataframe; only a few columns are
# selected for display so the table stays narrow.
summary_cols = ['summary', 'mpg', 'cyl', 'hp']
df_summary = df.describe()
df_summary.select(summary_cols).show()

+-------+------------------+------------------+-----------------+
|summary|               mpg|               cyl|               hp|
+-------+------------------+------------------+-----------------+
|  count|                32|                32|               32|
|   mean|20.090624999999996|            6.1875|         146.6875|
| stddev| 6.026948052089103|1.7859216469465444|68.56286848932059|
|    min|              10.4|                 4|               52|
|    max|              33.9|                 8|              335|
+-------+------------------+------------------+-----------------+



In [12]:
## Summary of any 2 columns
# Restrict describe() to just the columns of interest
two_cols = ['mpg', 'cyl']
df.describe(two_cols).show()

+-------+------------------+------------------+
|summary|               mpg|               cyl|
+-------+------------------+------------------+
|  count|                32|                32|
|   mean|20.090624999999996|            6.1875|
| stddev| 6.026948052089103|1.7859216469465444|
|    min|              10.4|                 4|
|    max|              33.9|                 8|
+-------+------------------+------------------+



### Write the first 7 rows of the sample dataframe to a CSV file

In [13]:
# Convert the Spark dataframe to a pandas dataframe (all 32 rows are brought
# into local memory, which is fine for a dataset this small).
df_p = df.toPandas()

# index=False keeps the pandas row index out of the file, so the CSV contains
# only the original columns for the first 7 rows.
df_p.head(7).to_csv("mtcars_head.csv", index=False)