## Install PySpark

In [1]:
# One-time setup: uncomment and run to install the packages this notebook imports.
# %pip targets the running kernel's environment (plain !pip may hit a different Python).
# %pip install pyspark
# %pip install py4j
# %pip install findspark

## Spark Context

In [2]:
# findspark.init() has to run BEFORE the first `import pyspark`: it locates the
# Spark installation and adds its Python libraries to sys.path. Calling it after
# the import (as before) cannot help, because the import has already succeeded
# or failed by then.
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [3]:
# Spark configuration, built step by step:
#   setMaster()  - which master/cluster to run against ("local" here)
#   setAppName() - application name
#   setAll()     - several key/value settings at once (driver and executor memory)
# NOTE(review): 40g / 50g assume a machine with that much RAM available — confirm.
conf = (
    pyspark.SparkConf()
    .setMaster("local")
    .setAppName("My first spark job")
    .setAll(
        [
            ("spark.driver.memory", "40g"),
            ("spark.executor.memory", "50g"),
        ]
    )
)

In [4]:
# Create the SparkContext from `conf`.
# getOrCreate() returns the already-running context when this cell is executed
# again; constructing SparkContext(conf=conf) a second time in the same kernel
# raises "ValueError: Cannot run multiple SparkContexts at once".
# On a fresh kernel the result is the same as SparkContext(conf=conf).

sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Display the SparkContext object to confirm it was created

sc

## SQL Context

In [6]:
# Build a SQLContext on top of the existing SparkContext `sc`.
# NOTE(review): SQLContext is reported as deprecated since Spark 3.0 in favour of
# SparkSession — confirm against the installed PySpark version.
sqlC = SQLContext(sc)



In [7]:
# Bare expression: displays the SQLContext object's repr
sqlC

<pyspark.sql.context.SQLContext at 0x1a8d9dd27f0>

## SparkSession

In [8]:
from pyspark.sql import SparkSession

In [9]:
# Build the SparkSession with an application name and a driver-memory setting;
# getOrCreate() returns an existing session if there is one, otherwise creates it.
# NOTE(review): a SparkContext was already started earlier in this notebook, so
# the driver-memory option given here may not take effect — confirm.
spark = (
    SparkSession.builder
    .appName("Python Spark Practice")
    .config("spark.driver.memory", "40g")
    .getOrCreate()
)

# Minimal alternative: spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [10]:
# Display the SparkSession object to confirm it was created
spark

## RDD

In [11]:
from pyspark.sql import SparkSession
# No builder options are passed here; getOrCreate() is expected to hand back the
# session already built in the SparkSession section — NOTE(review): confirm.
spark = SparkSession.builder.getOrCreate()

In [12]:
# parallelize() creates an RDD from a local Python collection;
# here the collection is a list of (city, number) tuples.
# NOTE(review): "Mumabai" looks like a misspelling of "Mumbai" — left unchanged
# because the recorded outputs show this exact string.
rdds = spark.sparkContext.parallelize(
    [
        ("Mumabai", 1),
        ("Delhi", 2),
        ("Hyderabad", 3),
    ]
)

In [13]:
# Bare expression: shows the RDD object's repr, not its elements
rdds

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [14]:
rdds.collect() # returns the RDD's elements as a Python list so they can be inspected

[('Mumabai', 1), ('Delhi', 2), ('Hyderabad', 3)]

In [15]:
# Number of elements in the RDD
rdds.count()

3

## DataFrame from RDD

In [16]:
from pyspark.sql import SparkSession
from datetime import date, datetime

In [17]:
# Re-fetch the session for this section; no builder options are passed, so this is
# expected to return the session created earlier — NOTE(review): confirm.
spark = SparkSession.builder.getOrCreate()


In [18]:
# Display the SparkSession object
spark

In [19]:
# Sample rows as tuples of (int, float, str, date, datetime), turned into an RDD.
# NOTE(review): row 1's timestamp is datetime(2021, 1, 12, 0), i.e. 12 Jan 00:00,
# while rows 2 and 3 follow the pattern datetime(2021, 1, <n>, 12, 0). It may have
# been meant as datetime(2021, 1, 1, 12, 0) — confirm before changing, since the
# recorded outputs below reflect the current value.
rdd = spark.sparkContext.parallelize(
    [
        (1, 1.0, "string1", date(2021, 1, 1), datetime(2021, 1, 12, 0)),
        (2, 2.0, "string2", date(2021, 2, 1), datetime(2021, 1, 2, 12, 0)),
        (3, 3.0, "string3", date(2021, 3, 1), datetime(2021, 1, 3, 12, 0)),
    ]
)

In [20]:
# Bare expression: shows the RDD object's repr, not its rows
rdd

ParallelCollectionRDD[2] at readRDDFromFile at PythonRDD.scala:274

In [21]:
# Bring every row back as a Python list of tuples
rdd.collect()

[(1,
  1.0,
  'string1',
  datetime.date(2021, 1, 1),
  datetime.datetime(2021, 1, 12, 0, 0)),
 (2,
  2.0,
  'string2',
  datetime.date(2021, 2, 1),
  datetime.datetime(2021, 1, 2, 12, 0)),
 (3,
  3.0,
  'string3',
  datetime.date(2021, 3, 1),
  datetime.datetime(2021, 1, 3, 12, 0))]

In [22]:
# Turn the RDD of tuples into a DataFrame. The schema list supplies column names
# only; the column types shown in the outputs below are inferred from the data.
# NOTE(review): the fourth column holds date values but is named "data" —
# possibly a typo for "date"; left unchanged because the outputs use this name.
df = spark.createDataFrame(
    rdd,
    schema=["num", "float", "string", "data", "datetime"],
)

In [23]:
# Bare expression: the repr lists column names and types, not the rows
df

DataFrame[num: bigint, float: double, string: string, data: date, datetime: timestamp]

In [24]:
# Print the rows as a text table
df.show()

+---+-----+-------+----------+-------------------+
|num|float| string|      data|           datetime|
+---+-----+-------+----------+-------------------+
|  1|  1.0|string1|2021-01-01|2021-01-12 00:00:00|
|  2|  2.0|string2|2021-02-01|2021-01-02 12:00:00|
|  3|  3.0|string3|2021-03-01|2021-01-03 12:00:00|
+---+-----+-------+----------+-------------------+



In [25]:
# Print only the first row
df.show(1)

+---+-----+-------+----------+-------------------+
|num|float| string|      data|           datetime|
+---+-----+-------+----------+-------------------+
|  1|  1.0|string1|2021-01-01|2021-01-12 00:00:00|
+---+-----+-------+----------+-------------------+
only showing top 1 row



In [27]:
# Print the schema as a tree: column name, type and nullability

df.printSchema()  

root
 |-- num: long (nullable = true)
 |-- float: double (nullable = true)
 |-- string: string (nullable = true)
 |-- data: date (nullable = true)
 |-- datetime: timestamp (nullable = true)

