In [3]:
import pyspark
from pyspark.sql import SparkSession

In [5]:
# Create the SparkSession for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("All pyspark ideas")
    .getOrCreate()
)


In [7]:
# Load the CSV, using its first row as the column names.
# No schema inference is requested here, so the columns are read as strings.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("billionaires.csv")
)

In [9]:
# Show summary statistics for the columns of the string-typed frame.
df_pyspark.describe().show()

+-------+-------------------+------------------+------------------+------------------+--------------------+--------------------+--------------+------------+------------------+-------------------+--------------------+---------------------+--------------------+------------------+-----------------+------------------------+-------------------+------------------------+-------------------+--------------------+----------------------+------------------------+
|summary|               name|              rank|              year|   company.founded|        company.name|company.relationship|company.sector|company.type|  demographics.age|demographics.gender|location.citizenship|location.country code|        location.gdp|   location.region|      wealth.type|wealth.worth in billions|wealth.how.category|wealth.how.from emerging|wealth.how.industry|wealth.how.inherited|wealth.how.was founder|wealth.how.was political|
+-------+-------------------+------------------+------------------+------------------+--

In [10]:
# Confirm that the reader returned a Spark DataFrame object.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [11]:
# Print the schema (column name, type, nullability).
# Because the file was read without inferSchema, the columns show up as strings.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- rank: string (nullable = true)
 |-- year: string (nullable = true)
 |-- company.founded: string (nullable = true)
 |-- company.name: string (nullable = true)
 |-- company.relationship: string (nullable = true)
 |-- company.sector: string (nullable = true)
 |-- company.type: string (nullable = true)
 |-- demographics.age: string (nullable = true)
 |-- demographics.gender: string (nullable = true)
 |-- location.citizenship: string (nullable = true)
 |-- location.country code: string (nullable = true)
 |-- location.gdp: string (nullable = true)
 |-- location.region: string (nullable = true)
 |-- wealth.type: string (nullable = true)
 |-- wealth.worth in billions: string (nullable = true)
 |-- wealth.how.category: string (nullable = true)
 |-- wealth.how.from emerging: string (nullable = true)
 |-- wealth.how.industry: string (nullable = true)
 |-- wealth.how.inherited: string (nullable = true)
 |-- wealth.how.was founder: string (nullable = tr

In [14]:
# Read the same file a second way: keyword arguments on csv(), and let Spark
# infer each column's type from the data instead of defaulting to strings.
df = spark.read.csv(
    "billionaires.csv",
    header=True,
    inferSchema=True,
)

In [20]:
# Preview the first 10 rows of the typed frame.
df.show(n=10)

+--------------------+----+----+---------------+--------------------+--------------------+---------------+-------------+----------------+-------------------+--------------------+---------------------+------------+---------------+--------------------+------------------------+-------------------+------------------------+-------------------+--------------------+----------------------+------------------------+
|                name|rank|year|company.founded|        company.name|company.relationship| company.sector| company.type|demographics.age|demographics.gender|location.citizenship|location.country code|location.gdp|location.region|         wealth.type|wealth.worth in billions|wealth.how.category|wealth.how.from emerging|wealth.how.industry|wealth.how.inherited|wealth.how.was founder|wealth.how.was political|
+--------------------+----+----+---------------+--------------------+--------------------+---------------+-------------+----------------+-------------------+--------------------+--

In [21]:
# Reading with inferSchema still yields a Spark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [23]:
# Project only the name and rank columns and preview the first 5 rows.
df.select("name", "rank").show(n=5)

+--------------+----+
|          name|rank|
+--------------+----+
|    Bill Gates|   1|
|    Bill Gates|   1|
|    Bill Gates|   1|
|Warren Buffett|   2|
|Warren Buffett|   2|
+--------------+----+
only showing top 5 rows



In [24]:
# List (column name, type) pairs for the frame read with inferSchema=True;
# numeric and boolean columns now carry real types instead of string.
df.dtypes

[('name', 'string'),
 ('rank', 'int'),
 ('year', 'int'),
 ('company.founded', 'int'),
 ('company.name', 'string'),
 ('company.relationship', 'string'),
 ('company.sector', 'string'),
 ('company.type', 'string'),
 ('demographics.age', 'int'),
 ('demographics.gender', 'string'),
 ('location.citizenship', 'string'),
 ('location.country code', 'string'),
 ('location.gdp', 'double'),
 ('location.region', 'string'),
 ('wealth.type', 'string'),
 ('wealth.worth in billions', 'double'),
 ('wealth.how.category', 'string'),
 ('wealth.how.from emerging', 'boolean'),
 ('wealth.how.industry', 'string'),
 ('wealth.how.inherited', 'string'),
 ('wealth.how.was founder', 'boolean'),
 ('wealth.how.was political', 'boolean')]

In [33]:
# Add a derived column holding the year plus ten.
# The source column is named "year" (lowercase); referencing it with its exact
# name avoids depending on Spark's default case-insensitive column resolution,
# which would fail if spark.sql.caseSensitive were enabled.
df2 = df.withColumn("Year after 10 years", df["year"] + 10)

In [31]:
# Compare the original year with the derived column (first 5 rows).
df2.select("year", "Year after 10 years").show(n=5)

+----+-------------------+
|year|Year after 10 years|
+----+-------------------+
|1996|               2006|
|2001|               2011|
|2014|               2024|
|1996|               2006|
|2001|               2011|
+----+-------------------+
only showing top 5 rows



In [34]:
# Rename the "year" column to "Yrs".
# withColumnRenamed is a silent no-op when the existing name does not match, so
# use the column's exact (lowercase) name rather than relying on
# case-insensitive resolution.
df3 = df2.withColumnRenamed("year", "Yrs")

In [36]:
# Check the renamed column, using the exact name it was given ("Yrs") so the
# lookup does not depend on case-insensitive column resolution.
df3.select("Yrs").show(5)

+----+
| yrs|
+----+
|1996|
|2001|
|2014|
|1996|
|2001|
+----+
only showing top 5 rows



### Filter Operations

In [37]:
# Drop the "Yrs" column.
# drop() returns a new DataFrame and leaves df3 untouched, so the result is
# bound to its own name instead of being discarded; df3 still has "Yrs".
df3_without_yrs = df3.drop("Yrs")
df3_without_yrs

DataFrame[name: string, rank: int, company.founded: int, company.name: string, company.relationship: string, company.sector: string, company.type: string, demographics.age: int, demographics.gender: string, location.citizenship: string, location.country code: string, location.gdp: double, location.region: string, wealth.type: string, wealth.worth in billions: double, wealth.how.category: string, wealth.how.from emerging: boolean, wealth.how.industry: string, wealth.how.inherited: string, wealth.how.was founder: boolean, wealth.how.was political: boolean, Year after 10 years: int]

In [41]:
# df3 itself was never reassigned after the drop, so "Yrs" is still present here.
df3.show()

+--------------------+----+----+---------------+--------------------+--------------------+--------------------+-------------+----------------+-------------------+--------------------+---------------------+------------+--------------------+--------------------+------------------------+-------------------+------------------------+--------------------+--------------------+----------------------+------------------------+-------------------+
|                name|rank| Yrs|company.founded|        company.name|company.relationship|      company.sector| company.type|demographics.age|demographics.gender|location.citizenship|location.country code|location.gdp|     location.region|         wealth.type|wealth.worth in billions|wealth.how.category|wealth.how.from emerging| wealth.how.industry|wealth.how.inherited|wealth.how.was founder|wealth.how.was political|Year after 10 years|
+--------------------+----+----+---------------+--------------------+--------------------+--------------------+-------