# PySpark DataFrames Part 1

In [1]:
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [None]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [4]:
spark

In [14]:
###Read the CSV
df_pyspark = spark.read.option('header','true').csv('products.csv', inferSchema = True)

In [18]:
### Check the schema
df_pyspark.printSchema()

root
 |-- Items: string (nullable = true)
 |-- Prices: integer (nullable = true)



In [19]:
df_pyspark = spark.read.csv('products.csv', header=True, inferSchema=True)
df_pyspark.show()

+-----------+------+
|      Items|Prices|
+-----------+------+
|     iPhone|   999|
|Macbook Air|   999|
|Macbook Pro|  1299|
|       Ipad|   499|
+-----------+------+



In [20]:
df_pyspark.printSchema()

root
 |-- Items: string (nullable = true)
 |-- Prices: integer (nullable = true)



In [21]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [23]:
df_pyspark.columns

['Items', 'Prices']

In [24]:
df_pyspark.head(5)

[Row(Items='iPhone', Prices=999),
 Row(Items='Macbook Air', Prices=999),
 Row(Items='Macbook Pro', Prices=1299),
 Row(Items='Ipad', Prices=499)]

In [28]:
items_column = df_pyspark.select('Items')
items_column.show()
type(items_column)


+-----------+
|      Items|
+-----------+
|     iPhone|
|Macbook Air|
|Macbook Pro|
|       Ipad|
+-----------+



pyspark.sql.dataframe.DataFrame

In [29]:
df_pyspark.dtypes

[('Items', 'string'), ('Prices', 'int')]

In [33]:
description = df_pyspark.describe()
print(description, type(description))
description.show()

DataFrame[summary: string, Items: string, Prices: string] <class 'pyspark.sql.dataframe.DataFrame'>
+-------+------+---------------+
|summary| Items|         Prices|
+-------+------+---------------+
|  count|     4|              4|
|   mean|  NULL|          949.0|
| stddev|  NULL|331.66247903554|
|    min|  Ipad|            499|
|    max|iPhone|           1299|
+-------+------+---------------+



In [36]:
### Adding columns in dataframe

df_updated = df_pyspark.withColumn('Price During Sale', df_pyspark['Prices']*0.85)
df_updated.show()

+-----------+------+------------------+
|      Items|Prices| Price During Sale|
+-----------+------+------------------+
|     iPhone|   999|            849.15|
|Macbook Air|   999|            849.15|
|Macbook Pro|  1299|1104.1499999999999|
|       Ipad|   499|            424.15|
+-----------+------+------------------+



In [40]:
### Drop the columns

df_updated = df_updated.drop('Price During Sale')

In [41]:
df_updated.show()

+-----------+------+
|      Items|Prices|
+-----------+------+
|     iPhone|   999|
|Macbook Air|   999|
|Macbook Pro|  1299|
|       Ipad|   499|
+-----------+------+



In [43]:
### Rename the columns

df_updated = df_updated.withColumnRenamed('Items', 'Apple Products')

In [44]:
df_updated.show()

+--------------+------+
|Apple Products|Prices|
+--------------+------+
|        iPhone|   999|
|   Macbook Air|   999|
|   Macbook Pro|  1299|
|          Ipad|   499|
+--------------+------+



In [45]:
df_pyspark.show()

+-----------+------+
|      Items|Prices|
+-----------+------+
|     iPhone|   999|
|Macbook Air|   999|
|Macbook Pro|  1299|
|       Ipad|   499|
+-----------+------+

