Spark Initialization

In [None]:
import findspark
findspark.init()

In [None]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


In [None]:
# Build the configuration first. Settings such as spark.driver.extraJavaOptions
# only take effect when they are supplied at the moment the SparkContext (and
# its JVM) is started; previously the context was created before the conf.
conf = SparkConf() \
    .setAppName("Python Spark DataFrames basic example") \
    .set("spark.driver.extraJavaOptions", "-Dlog4j.configuration=log4j.properties")

# Creating a spark context class.
# getOrCreate() reuses a running context, so re-executing this cell does not
# fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Creating a spark session (it attaches to the context created above)
spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config(conf=conf) \
    .getOrCreate()

# To use the MySQL JDBC driver, add its jar to the driver class path on the builder, e.g.:
#     .config('spark.driver.extraClassPath',"C:\Users\varsh\spark\spark-3.3.1-bin-hadoop3\jars\mysql-connector-java-8.0.13.jar") \
#     .getOrCreate()

In [None]:
spark

In [None]:
sc

In [None]:
print(type(sc))

In [None]:
# DataFrame with one row per function known to this Spark session
f = spark.sql('show functions')
# Number of functions available
f.count()

In [None]:
# Show every function: the row count depends on the Spark version, so ask the
# DataFrame for it instead of hard-coding a number
f.show(f.count())

In [None]:

sc.uiWebUrl

Reading CSV files

In [None]:
# Load AMZN.csv: the first row supplies the column names and Spark infers the column types
amazon = spark.read.options(header=True, inferSchema=True).csv('C:/Users/varsh/Sample_Data/AMZN.csv')
amazon.show(truncate=False)

In [None]:
# Load GOOG.csv with a header row and inferred column types
google = spark.read.options(header=True, inferSchema=True).csv('C:/Users/varsh/Sample_Data/GOOG.csv')
google.show(truncate=False)

In [None]:
# Load TSLA.csv with a header row and inferred column types
tesla = spark.read.options(header=True, inferSchema=True).csv('C:/Users/varsh/Sample_Data/TSLA.csv')
tesla.show(truncate=False)

In [None]:
# Load every file matching the Individual* pattern in IncidentLevelCSV into one DataFrame
incidents = spark.read.options(header=True, inferSchema=True).csv(
    'C:/Users/varsh/Sample_Data/IncidentLevelCSV/Individual*'
)
incidents.show()

Get partitions

In [None]:
print(incidents.rdd.getNumPartitions())

In [None]:
# Default number of partitions Spark uses for parallel operations
print(spark.sparkContext.defaultParallelism) 
# With master local[*] the default equals the number of cores on this machine;
# on a cluster it is the total number of cores available to the application.
# At most that many tasks run at the same time (e.g. 4 on a 4-core machine).

In [None]:
incidents.printSchema()

In [None]:
# Mark the DataFrame for caching. Caching is lazy: nothing is stored until an
# action (such as the show() below) actually computes the data.
incidents.cache()
incidents.show()

Creating a Schema

In [None]:
# Explicit schema for employees.csv, supplied up front instead of being inferred.
# NOTE(review): Manager_id is a string here but an integer in Manager_schema — confirm this is intended.
Emp_schema = (
    StructType()
    .add('Emp_id', IntegerType())
    .add('Emp_name', StringType())
    .add('Manager_id', StringType())
)

# Read the CSV using the schema defined above
employees = spark.read.csv('C:/Users/varsh/Sample_Data/employees.csv', schema=Emp_schema)
employees.show()


In [None]:
import findspark

findspark.init()

In [None]:

# Schema for the in-memory manager lookup table
Manager_schema = (
    StructType()
    .add('Manager_id', IntegerType())
    .add('Manager_name', StringType())
)
# Manager rows as (Manager_id, Manager_name) tuples
l = [
    (1, 'Karthik'),
    (2, 'Ganesh'),
    (3, 'Thambi'),
    (4, 'Mayils'),
    (5, 'Surya'),
]
# Alternative route through an RDD:
# r = sc.parallelize(l)
# r.collect()
# manager = r.toDF()
manager = spark.createDataFrame(l, Manager_schema)
manager.show()

Join

In [None]:
# Inner join: pair each employee row with the manager row that has the same Manager_id
df1 = employees.join(
    manager,
    on=employees['Manager_id'] == manager['Manager_id'],
    how='inner',
)
df1.show()

In [None]:
df1.explain(extended=True)

In [None]:
df1.show()

Select

In [None]:
# Two equivalent ways to pick a column: by name ...
df1.select('Emp_name').show()
# ... or through the DataFrame's column attribute
df1.select(df1.Emp_name).show()

Filter

In [None]:
# Filter while Manager_name is still part of the DataFrame, then project.
# Filtering after select('Emp_name') relied on Spark resolving a column that
# had already been projected away.
df1.filter(df1.Manager_name == 'Surya').select('Emp_name').show()

In [None]:
# Filter on Manager_name before projecting it away, then sort the remaining
# Emp_name column alphabetically.
df1.filter(df1.Manager_name == 'Surya').select('Emp_name').orderBy('Emp_name').show()

Group by

In [None]:
# Count the joined rows per manager, sorted by manager name
(
    df1
    .groupBy('Manager_name')
    .count()
    .orderBy('Manager_name')
    .show()
)

Add a column

In [None]:
# Add an 'Abbr' column holding the first character of the manager's name
df1.withColumn(
    colName='Abbr',
    col=df1['Manager_name'].substr(startPos=1, length=1),
).show()

In [None]:
tesla.show(2)

In [None]:
tesla.printSchema()

In [None]:
# Replace Volume with a string-typed version of itself and display the resulting schema
tesla.withColumn('Volume', tesla['Volume'].cast('string')).printSchema()

In [None]:
incidents.printSchema()

In [None]:
incidents.show(2,truncate=False)

In [None]:
# incidents.select('hour','total_offense').show()

def func(i):
    """Return 'Major' when *i* is greater than 1, otherwise 'Minor'."""
    return 'Major' if i > 1 else 'Minor'
# Label each incident 'Major' when total_offense > 1, else 'Minor', and show the label next to the count
(
    incidents
    .withColumn('Statement', when(incidents['total_offense'] > 1, 'Major').otherwise('Minor'))
    .select('total_offense', 'statement')
    .show()
)

Spark SQL statements with createOrReplaceTempView

In [None]:
# Register the DataFrame as the temporary view 'manager' so it can be queried with SQL
manager.createOrReplaceTempView('manager')
spark.sql('select * from manager').show()

In [None]:
# Register the DataFrame as the temporary view 'employees' so it can be queried with SQL
employees.createOrReplaceTempView('employees')
spark.sql('select * from employees').show()

In [None]:
spark.sql('show databases').show()

In [None]:
spark.sql('show tables').show()

In [None]:
spark.sql('show tables in default').show()

In [None]:
# "if not exists" keeps this cell re-runnable: a plain "create database"
# raises once the database has already been created
spark.sql('create database if not exists permanent').show()

In [None]:
spark.sql('show databases').show()

Save as permanent table in spark warehouse directory

In [None]:
# mode('overwrite') keeps this cell re-runnable: the default save mode raises
# an error when permanent.employees already exists
employees.write.mode('overwrite').saveAsTable('permanent.employees')
# gets stored in spark-warehouse directory
# storage system - local file system
# metastore - embedded Derby
# creation of permanent tables is not supported in spark 1.x without hive integration
# spark by default stores data or reads data in parquet format

In [None]:
spark.sql('show tables in permanent').show()

Read from external DB using JDBC

In [None]:
# Load the result of a SQL query from MySQL over JDBC.
# NOTE(review): user and password are hard-coded; consider reading them from the environment.
mysql = (
    spark.read.format('jdbc')
    .options(
        url='jdbc:mysql://localhost:3306',
        driver='com.mysql.cj.jdbc.Driver',
        user='user',
        password='pass',
        query='select * from data.country',
    )
    .load()
)

mysql.show()

Write to external DB using JDBC

In [None]:
# Write the manager DataFrame to the MySQL table 'manager' over JDBC, replacing any existing contents.
# NOTE(review): user and password are hard-coded; consider reading them from the environment.
(
    manager.write.format('jdbc')
    .mode('overwrite')
    .options(
        url='jdbc:mysql://localhost:3306/data',
        driver='com.mysql.cj.jdbc.Driver',
        user='user',
        password='pass',
        dbtable='manager',
    )
    .save()
)

Read a JSON file

In [None]:
# Read a JSON document that spans several lines (hence the multiline option)
json_file = spark.read.option('multiline',True).json("hdfs://127.0.0.1:9000/data/sample_data/example_2.json")
json_file.printSchema()
# By default Spark expects one complete JSON object per line (JSON Lines),
# so a file like the following is read without any extra option:
# {"name": "Varshini", "ID": 1}
# {"name": "Santhiya", "ID": 2}
# When a single JSON document spans several lines, the multiline option must be set.

json_file.show()

In [None]:
# https://api.github.com/users

import requests
import json

# A timeout stops the notebook from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors (e.g. rate limiting) here instead of
# letting an error payload be written to disk by a later cell.
apidata = requests.get('https://api.github.com/users', timeout=30)
apidata.raise_for_status()

In [None]:
# Serialise the API response as pretty-printed JSON text
json_api = json.dumps(apidata.json(),indent=4)
# print(len(json_api))
# print(json_api)
# print(type(json_api))
# The context manager closes the file even if the write raises
with open('C:/Users/varsh/Sample_Data/rest_api', 'w', encoding='utf-8') as out_file:
    out_file.write(json_api)

In [None]:
df_api_json = spark.read.option('multiline',True).json('C:/Users/varsh/Sample_Data/rest_api')
df_api_json.show()

In [None]:
# Hive stores its data in HDFS (or another distributed file system) and its metadata in an RDBMS
# Hive uses HQL (Hive Query Language)
# It is a data warehouse used for analysis with a SQL-like language
# Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.

# In hive>conf>hive-site.xml set the value of hive.execution.engine to spark or tez.


# Configuration of Hive is done by placing your hive-site.xml, core-site.xml (for security configuration), and hdfs-site.xml (for HDFS configuration) file in conf/.

# When not configured by the hive-site.xml, the context automatically creates metastore_db in the current directory and creates a directory configured by spark.sql.warehouse.dir, which defaults to the directory spark-warehouse in the current directory that the Spark application is started. Note that the hive.metastore.warehouse.dir property in hive-site.xml is deprecated since Spark 2.0.0. Instead, use spark.sql.warehouse.dir to specify the default location of database in warehouse. You may need to grant write privilege to the user who starts the Spark application.

Create an RDD

In [None]:
# Distribute a small list of words as an RDD
words = spark.sparkContext.parallelize(
    ['a', 'aa', 'a', 'aa', 'a', 'b', 'bb', 'bbb', 'a', 'aa', 'bbb']
)
# words.collect()
# words.count()
# Words in sorted order
words.sortBy(lambda word: word).collect()
# Each distinct word once
words.distinct().collect()

Map 

In [None]:
# Pair every word with the count 1, giving (word, 1) tuples
word_map = words.map(lambda word: (word, 1))
word_map.collect()

Reduce By Key

In [None]:
# Add up the 1s of identical words to get a (word, occurrences) pair per word
word_reduce = word_map.reduceByKey(lambda left, right: left + right)
word_reduce.collect()

Create RDD from external text file

In [None]:
text = spark.sparkContext.textFile('C:/Users/varsh/Sample_Data/varshini.txt')
text.collect()

Flat Map and Save as external file

In [None]:
# Split every line on single spaces and flatten the pieces into one RDD of words
text_fm = text.flatMap(lambda a:a.split(' '))
# Write the words out as text under word_count_result
# NOTE(review): saveAsTextFile normally refuses to write into an existing directory,
# so re-running this cell probably needs the old output removed first — confirm
text_fm.saveAsTextFile('C:/Users/varsh/Sample_Data/word_count_result')

In [None]:
# Read the saved output back in; note that this rebinds `text`, which previously held varshini.txt
text = spark.sparkContext.textFile('C:/Users/varsh/Sample_Data/word_count_result')
text.collect()

In [None]:
# Sometimes Spark returns values as unicode strings (displayed with a u'' prefix).
# In such cases, call .encode('utf-8') on the value to convert it into a plain string.

In [None]:
# Disk - Slow (in TB)
# Ram - Fast (in GB)
# Cache - Fastest (in KB)