In [1]:
# To use PySpark from Jupyter Notebook
import findspark
findspark.init()

![](https://i.ytimg.com/vi/_C8kWso4ne4/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLCddYd_I3xvvPYM4k98MjdyMdVDUw)

# <span style='color: orange'>PySpark</span> is inteface (API) for Apache Spark in python

In [2]:
import pyspark
import pandas as pd

In [3]:
data = pd.read_csv('ds_salaries.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


In [4]:
# Create PySpark session
from pyspark.sql import SparkSession

# getOrCreate() returns an already existing SparkSession; if it doesn't exist, a new SparkSession is created.
spark = SparkSession.builder.appName('Practise').getOrCreate()
spark

# [pyspark.sql.SparkSession](https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.SparkSession)

In [5]:
# Read dataset
data = spark.read.csv('ds_salaries.csv')
data

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string]

In [6]:
# Let's set correct header
data = spark.read.option('header', 'true').csv('ds_salaries.csv')
data.show()

+---+---------+----------------+---------------+--------------------+--------+---------------+-------------+------------------+------------+----------------+------------+
|_c0|work_year|experience_level|employment_type|           job_title|  salary|salary_currency|salary_in_usd|employee_residence|remote_ratio|company_location|company_size|
+---+---------+----------------+---------------+--------------------+--------+---------------+-------------+------------------+------------+----------------+------------+
|  0|     2020|              MI|             FT|      Data Scientist|   70000|            EUR|        79833|                DE|           0|              DE|           L|
|  1|     2020|              SE|             FT|Machine Learning ...|  260000|            USD|       260000|                JP|           0|              JP|           S|
|  2|     2020|              SE|             FT|   Big Data Engineer|   85000|            GBP|       109024|                GB|          50|     

In [7]:
type(data)

pyspark.sql.dataframe.DataFrame

In [8]:
# First observation
data.head(1)

[Row(_c0='0', work_year='2020', experience_level='MI', employment_type='FT', job_title='Data Scientist', salary='70000', salary_currency='EUR', salary_in_usd='79833', employee_residence='DE', remote_ratio='0', company_location='DE', company_size='L')]

In [9]:
# Prints out the schema in the tree format
data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- work_year: string (nullable = true)
 |-- experience_level: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- salary_currency: string (nullable = true)
 |-- salary_in_usd: string (nullable = true)
 |-- employee_residence: string (nullable = true)
 |-- remote_ratio: string (nullable = true)
 |-- company_location: string (nullable = true)
 |-- company_size: string (nullable = true)



# Part 2

In [10]:
# Creating test dataframe
df = pd.DataFrame({
    'Name': ['Krish', 'Sudhanshu', 'Sunny'],
    'age': [31, 30, 29],
    'Experience': [10, 8, 4]
})
df

Unnamed: 0,Name,age,Experience
0,Krish,31,10
1,Sudhanshu,30,8
2,Sunny,29,4


In [11]:
from pyspark.sql import SparkSession

In [12]:
# Start session
spark = SparkSession.builder.appName('Dataframe').getOrCreate()
spark  # We got one master node (local computer)

In [16]:
# Save our dataframe and read it using PySpark
df.to_csv('df.csv')

df_pyspark = spark.read.option('header', 'true').csv('df.csv')
df_pyspark.show()

+---+---------+---+----------+
|_c0|     Name|age|Experience|
+---+---------+---+----------+
|  0|    Krish| 31|        10|
|  1|Sudhanshu| 30|         8|
|  2|    Sunny| 29|         4|
+---+---------+---+----------+



In [17]:
# Print Schema
df_pyspark.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)



By default type of values is string

In [18]:
# Let's get right types (inferSchema=True)
pyspark = spark.read.option('header', 'true').csv('df.csv',
                                                 inferSchema=True)
pyspark.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)

