In [99]:
import pandas as pd
# Build a three-row sample table (one tuple per person) and persist it as CSV
# so the Spark cells below have a file to read.
df = pd.DataFrame(
    [('John', 28, 5), ('Jane', 34, 10), ('Doe', 45, 15)],
    columns=['Name', 'Age', 'Experience'],
)
# index=False keeps the pandas row index out of the file, so the CSV holds
# only the three named columns.
df.to_csv('dataframe1.csv', index=False)


In [100]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused, otherwise a
# new one is started) under the application name "Practise_Dataframe".
spark = (
    SparkSession.builder
    .appName("Practise_Dataframe")
    .getOrCreate()
)

In [101]:
# Read the CSV, treating its first line as the column names, and print the
# rows as a text table.
(
    spark.read
    .option('header', 'true')
    .csv('dataframe1.csv')
    .show()
)

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|John| 28|         5|
|Jane| 34|        10|
| Doe| 45|        15|
+----+---+----------+



In [102]:
# Keep the header-aware read as a DataFrame for later inspection. No schema
# inference is requested here.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('dataframe1.csv')
)

In [103]:
# Print each column's name, type and nullability. With only the header option
# set, every column (including Age and Experience) is reported as string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [104]:
# Reload the file with inferSchema=True so Spark derives column types from the
# data instead of reading every column as a string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('dataframe1.csv', inferSchema=True)
)

In [105]:
# Confirm the effect of schema inference: Age and Experience are now reported
# as integer, while Name stays string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [106]:
# Load the CSV again, this time passing both reader options as keyword
# arguments to csv() rather than chaining option() calls.
df_pyspark = spark.read.csv(
    'dataframe1.csv',
    header=True,
    inferSchema=True,
)
# Display the loaded rows.
df_pyspark.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|John| 28|         5|
|Jane| 34|        10|
| Doe| 45|        15|
+----+---+----------+



In [107]:
# Schema of the DataFrame loaded via csv(header=True, inferSchema=True):
# Name is string, Age and Experience are integer.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [108]:
# Inspect the Python class of the object returned by spark.read.csv().
type(df_pyspark)

pyspark.sql.classic.dataframe.DataFrame

In [109]:
# Column names as a plain Python list of strings.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [110]:
# (column name, type name) pairs; the int entries reflect the inferred schema.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [111]:
# Return the first two rows as a list of Row objects (not a printed table).
df_pyspark.head(2)

[Row(Name='John', Age=28, Experience=5),
 Row(Name='Jane', Age=34, Experience=10)]

In [112]:
# Print the DataFrame's rows as a text table.
df_pyspark.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|John| 28|         5|
|Jane| 34|        10|
| Doe| 45|        15|
+----+---+----------+



In [113]:
# select() produces a DataFrame holding only the requested column, so the
# result can be displayed with show().
df_pyspark.select('Name').show()

+----+
|Name|
+----+
|John|
|Jane|
| Doe|
+----+



In [114]:
# Passing a list of names to select() keeps several columns at once.
df_pyspark.select(['Name', 'Experience']).show()

+----+----------+
|Name|Experience|
+----+----------+
|John|         5|
|Jane|        10|
| Doe|        15|
+----+----------+



In [115]:
# Indexing the DataFrame by a column name yields a Column expression object
# (see the repr below), not the column's values.
df_pyspark['Name']

Column<'Name'>

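`df_pyspark['Name'].show()` raises an error: indexing a DataFrame by column name returns a `Column` object (as shown above), not a DataFrame, and a `Column` has no `show()` method. To display a single column, use `df_pyspark.select('Name').show()` instead.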

In [116]:
# Summary statistics (count, mean, stddev, min, max) for every column. For the
# string column Name, mean and stddev are NULL and min/max are string values.
df_pyspark.describe().show()

+-------+----+------------------+----------+
|summary|Name|               Age|Experience|
+-------+----+------------------+----------+
|  count|   3|                 3|         3|
|   mean|NULL|35.666666666666664|      10.0|
| stddev|NULL| 8.621678104251709|       5.0|
|    min| Doe|                28|         5|
|    max|John|                45|        15|
+-------+----+------------------+----------+



In [117]:
# Display a copy of the table with a derived column equal to Experience + 2.
# The result is only shown, not assigned, so df_pyspark itself is unchanged.
df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark['Experience'] + 2,
).show()

+----+---+----------+------------------------+
|Name|Age|Experience|Experience after 2 years|
+----+---+----------+------------------------+
|John| 28|         5|                       7|
|Jane| 34|        10|                      12|
| Doe| 45|        15|                      17|
+----+---+----------+------------------------+



In [118]:
# drop() returns a new DataFrame without the named column and silently does
# nothing when that column is absent. The earlier withColumn() result was
# never assigned, so df_pyspark has no 'Experience after 2 years' column of
# its own; the column is rebuilt here first so that drop() really removes
# something. The displayed table is the original three columns.
(
    df_pyspark
    .withColumn('Experience after 2 years', df_pyspark['Experience'] + 2)
    .drop('Experience after 2 years')
    .show()
)

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|John| 28|         5|
|Jane| 34|        10|
| Doe| 45|        15|
+----+---+----------+



In [119]:
# Display the table with the Experience column renamed to
# 'Years of Experience'. The result is not assigned back to df_pyspark.
df_pyspark.withColumnRenamed(
    'Experience',
    'Years of Experience',
).show()

+----+---+-------------------+
|Name|Age|Years of Experience|
+----+---+-------------------+
|John| 28|                  5|
|Jane| 34|                 10|
| Doe| 45|                 15|
+----+---+-------------------+

