In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the notebook's SparkSession (or reuse one that already exists),
# registered under the application name 'DataFrames'.
spark = (
    SparkSession.builder
    .appName('DataFrames')
    .getOrCreate()
)

21/11/09 10:20:00 WARN Utils: Your hostname, laptop resolves to a loopback address: 127.0.1.1; using 10.19.10.34 instead (on interface wlo1)
21/11/09 10:20:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/11/09 10:20:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/09 10:20:07 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Evaluate the session object as the cell's final expression so the notebook renders it.
spark

In [8]:
# Read test1.csv into a DataFrame: take column names from the header row
# and let Spark infer each column's type instead of defaulting to strings.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', True)
    .csv('test1.csv')
)

In [9]:
# Print each column's name, inferred type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [10]:
# Alternative way to read the same file: pass the reader settings
# directly to csv() as keyword arguments, then check the schema.
df_pyspark = spark.read.csv(
    'test1.csv',
    inferSchema=True,
    header=True,
)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [11]:
# Confirm the reader produced a PySpark DataFrame object.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [17]:
# List the column names as a plain Python list of strings.
df_pyspark.columns

['Name', 'age', 'Experience', 'Salary']

In [18]:
# Fetch the first 3 rows; the result is a Python list of Row objects, not a DataFrame.
df_pyspark.head(3)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [19]:
# Print the DataFrame's rows as a text table.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [22]:
# Select a single column: select() returns a new DataFrame, which show() then prints.
df_pyspark.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [23]:
# Select several columns at once by passing each column name as its own argument.
# Without show(), the cell displays only the resulting DataFrame's schema summary.
df_pyspark.select('Name', 'age')

DataFrame[Name: string, age: int]

In [24]:
# Project the 'Name' and 'age' columns and print their rows.
df_pyspark.select('Name', 'age').show()

+---------+---+
|     Name|age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
|     Paul| 24|
|   Harsha| 21|
|  Shubham| 23|
+---------+---+



In [25]:
# List (column name, type name) pairs for every column.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [26]:
# describe() returns a new DataFrame of summary statistics;
# without show() the cell displays only that DataFrame's schema.
df_pyspark.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

In [27]:
# Print count, mean, stddev, min and max for each column
# (mean and stddev come out as null for the string column 'Name').
df_pyspark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [30]:
# Add a column derived from an existing one. withColumn() returns a new
# DataFrame; since the result is not assigned, only its schema is displayed.
df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark.Experience + 2,
)

DataFrame[Name: string, age: int, Experience: int, Salary: int, Experience after 2 years: int]

In [31]:
# Preview the DataFrame with an extra column equal to Experience + 2.
# The result is printed but not kept, so df_pyspark is left as it was.
df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark.Experience + 2,
).show()

+---------+---+----------+------+------------------------+
|     Name|age|Experience|Salary|Experience after 2 years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                      12|
|Sudhanshu| 30|         8| 25000|                      10|
|    Sunny| 29|         4| 20000|                       6|
|     Paul| 24|         3| 20000|                       5|
|   Harsha| 21|         1| 15000|                       3|
|  Shubham| 23|         2| 18000|                       4|
+---------+---+----------+------+------------------------+



In [32]:
# withColumn() does not modify the DataFrame in place: the earlier calls never
# assigned their result, so df_pyspark still has only its original columns.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [33]:
# Rebind df_pyspark to the DataFrame returned by withColumn() so the
# new 'Expr after 2 yrs' column (Experience + 2) persists in later cells.
df_pyspark = df_pyspark.withColumn(
    'Expr after 2 yrs',
    df_pyspark.Experience + 2,
)

In [34]:
# Print the DataFrame to check its current columns and rows.
df_pyspark.show()

+---------+---+----------+------+----------------+
|     Name|age|Experience|Salary|Expr after 2 yrs|
+---------+---+----------+------+----------------+
|    Krish| 31|        10| 30000|              12|
|Sudhanshu| 30|         8| 25000|              10|
|    Sunny| 29|         4| 20000|               6|
|     Paul| 24|         3| 20000|               5|
|   Harsha| 21|         1| 15000|               3|
|  Shubham| 23|         2| 18000|               4|
+---------+---+----------+------+----------------+



In [35]:
# Drop a column by name. The result of drop() is assigned back to
# df_pyspark so the removal carries over to later cells; then print to verify.
df_pyspark = df_pyspark.drop('Expr after 2 yrs')
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [36]:
# Rename the 'Name' column to 'First Name'. The result is assigned back to
# df_pyspark so the new name is what later cells see; then print to verify.
df_pyspark = df_pyspark.withColumnRenamed(existing='Name', new='First Name')
df_pyspark.show()

+----------+---+----------+------+
|First Name|age|Experience|Salary|
+----------+---+----------+------+
|     Krish| 31|        10| 30000|
| Sudhanshu| 30|         8| 25000|
|     Sunny| 29|         4| 20000|
|      Paul| 24|         3| 20000|
|    Harsha| 21|         1| 15000|
|   Shubham| 23|         2| 18000|
+----------+---+----------+------+

