In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (getOrCreate reuses an existing one if present)
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Evaluate the session object so the notebook displays it
spark

In [6]:
## Read the dataset, using the first line of age.csv as the column names
(
    spark.read
    .option('header', 'true')
    .csv('age.csv')
)

DataFrame[name: string, age: string, experience: string]

In [7]:
# To see the rows rather than just the schema, append .show();
# by default it prints only the first 20 rows, not necessarily the whole dataset
(
    spark.read
    .option('header', 'true')
    .csv('age.csv')
    .show()
)

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|srinath| 32|         4|
| aparna| 31|         5|
|sahasra|  3|         2|
+-------+---+----------+



In [9]:
# Keep the loaded DataFrame in df_pyspark so that later cells can reuse it
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('age.csv')
)

In [10]:
### Check the schema: column names, data types and nullability
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- experience: string (nullable = true)



In [None]:
## Here age and experience were read as strings, but they should be integers. Passing inferSchema=True when reading the CSV makes Spark infer the column types.

In [11]:
# Re-read the file, this time asking Spark to infer each column's type from the data
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('age.csv', inferSchema=True)
)
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [14]:
# header and inferSchema can both be passed as keyword arguments in a single csv() call
df_pyspark = spark.read.csv(
    'age.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|srinath| 32|         4|
| aparna| 31|         5|
|sahasra|  3|         2|
+-------+---+----------+



In [15]:
# Confirm that age and experience are integer columns after the re-read
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [16]:
# Check the Python type of the loaded object (a PySpark DataFrame)
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [18]:
# List the column names
df_pyspark.columns

['name', 'age', 'experience']

In [19]:
# Fetch the first 3 rows; head(n) returns them as a list of Row objects
df_pyspark.head(3)

[Row(name='srinath', age=32, experience=4),
 Row(name='aparna', age=31, experience=5),
 Row(name='sahasra', age=3, experience=2)]

In [20]:
# Print the DataFrame as a text table
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|srinath| 32|         4|
| aparna| 31|         5|
|sahasra|  3|         2|
+-------+---+----------+



In [21]:
# Pick a single column: select() returns a DataFrame, so only its schema is displayed here

df_pyspark.select('name')

DataFrame[name: string]

In [23]:
# Pick a single column and call .show() to print its values
df_pyspark.select('name').show()

+-------+
|   name|
+-------+
|srinath|
| aparna|
|sahasra|
+-------+



In [25]:
# Selecting a single column still yields a DataFrame
type(df_pyspark.select('name'))

pyspark.sql.dataframe.DataFrame

In [28]:
# Pick two columns at once; the result is again a DataFrame
df_pyspark.select('name', 'age')

DataFrame[name: string, age: int]

In [29]:
# Print the values of the two selected columns
df_pyspark.select('name', 'age').show()

+-------+---+
|   name|age|
+-------+---+
|srinath| 32|
| aparna| 31|
|sahasra|  3|
+-------+---+



In [30]:
# List each column together with its data type as (name, type) pairs
df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

In [31]:
# Indexing with a column name returns a Column object, not a DataFrame
df_pyspark['name']

Column<'name'>

In [33]:
# describe() returns a DataFrame of summary statistics; every column in it is typed as string
df_pyspark.describe()

DataFrame[summary: string, name: string, age: string, experience: string]

In [34]:
# Print the summary statistics: count, mean, stddev, min and max for each column
df_pyspark.describe().show()

+-------+-------+-----------------+------------------+
|summary|   name|              age|        experience|
+-------+-------+-----------------+------------------+
|  count|      3|                3|                 3|
|   mean|   NULL|             22.0|3.6666666666666665|
| stddev|   NULL|16.46207763315433|1.5275252316519465|
|    min| aparna|                3|                 2|
|    max|srinath|               32|                 5|
+-------+-------+-----------------+------------------+



In [36]:
# Add a derived column holding each row's experience plus 2
df_pyspark = df_pyspark.withColumn(
    'experience after 2 years',
    df_pyspark['experience'] + 2,
)

In [38]:
# Verify that the new column is present
df_pyspark.show()

+-------+---+----------+------------------------+
|   name|age|experience|experience after 2 years|
+-------+---+----------+------------------------+
|srinath| 32|         4|                       6|
| aparna| 31|         5|                       7|
|sahasra|  3|         2|                       4|
+-------+---+----------+------------------------+



In [43]:
# Drop the derived column again and reassign the result to df_pyspark
df_pyspark = df_pyspark.drop('experience after 2 years')

In [44]:
# Verify that the derived column is gone
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|srinath| 32|         4|
| aparna| 31|         5|
|sahasra|  3|         2|
+-------+---+----------+



In [45]:
# Rename a column; the renamed result is only displayed here and is not assigned back to df_pyspark
df_pyspark.withColumnRenamed('name','new name').show()

+--------+---+----------+
|new name|age|experience|
+--------+---+----------+
| srinath| 32|         4|
|  aparna| 31|         5|
| sahasra|  3|         2|
+--------+---+----------+

