# Part 1 

This part includes:
* load data from files
* show data information
* selecting data
* modify columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session for this notebook; `getOrCreate()` returns the session
# produced by the builder chain configured with the app name 'Part 1'.
spark = (
    SparkSession.builder
    .appName('Part 1')
    .getOrCreate()
)

## Load data from files

### CSV files
* with headers
* with inferred column types (`inferSchema`)
* specified separator

In [3]:
# The same semicolon-separated file is read three times; keep its path in one place.
data_1_csv_path = './data/Part_1/data_1.csv'

# Variant 1: the header flag is set through the reader's generic `.option(...)` call.
basic_dataframe_v1 = (
    spark.read
    .option('header', 'true')
    .csv(data_1_csv_path, sep=';', inferSchema=True)
)

# Variant 2: the header flag is passed as a keyword argument of `.csv(...)`.
basic_dataframe_v2 = spark.read.csv(
    data_1_csv_path,
    sep=';',
    inferSchema=True,
    header=True,
)

# Frame used by the remaining cells of this notebook (read the same way as variant 2).
basic_dataframe = spark.read.csv(
    data_1_csv_path,
    sep=';',
    inferSchema=True,
    header=True,
)

## Show data information

* show data
* show schemas
* show data types
* show column summary statistics (`describe`)

In [4]:
# Print both loaded variants, one table after the other, for comparison.
for frame in (basic_dataframe_v1, basic_dataframe_v2):
    frame.show()

+---+-----+-----+
| id| name|price|
+---+-----+-----+
|  1| Milk|  2.0|
|  2|  Bar|  1.5|
|  3|Bread|  3.5|
+---+-----+-----+

+---+-----+-----+
| id| name|price|
+---+-----+-----+
|  1| Milk|  2.0|
|  2|  Bar|  1.5|
|  3|Bread|  3.5|
+---+-----+-----+



In [5]:
# Print the schema tree (column name, type, nullability) of each loaded variant.
for frame in (basic_dataframe_v1, basic_dataframe_v2):
    frame.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)



In [6]:
# `dtypes` is an attribute, not a method call; as the cell's last expression its
# value is displayed: a list of (column name, type name) pairs.
basic_dataframe.dtypes

[('id', 'int'), ('name', 'string'), ('price', 'double')]

In [7]:
# Summary table (count, mean, stddev, min, max) over every column of the frame.
(
    basic_dataframe
    .describe()
    .show()
)

# The same summary restricted to the listed columns, here only 'id'.
(
    basic_dataframe
    .describe(['id'])
    .show()
)

+-------+---+----+------------------+
|summary| id|name|             price|
+-------+---+----+------------------+
|  count|  3|   3|                 3|
|   mean|2.0|NULL|2.3333333333333335|
| stddev|1.0|NULL|1.0408329997330663|
|    min|  1| Bar|               1.5|
|    max|  3|Milk|               3.5|
+-------+---+----+------------------+

+-------+---+
|summary| id|
+-------+---+
|  count|  3|
|   mean|2.0|
| stddev|1.0|
|    min|  1|
|    max|  3|
+-------+---+



## Selecting data

* Show columns
* Show specified top rows
* Show specified columns 

In [8]:
# `columns` is an attribute; the cell displays the column names as a plain Python list.
basic_dataframe.columns

['id', 'name', 'price']

In [9]:
# Returns the first 2 rows as a list of Row objects (displayed as the cell result,
# not printed as a table the way `show()` does).
basic_dataframe.head(2)

[Row(id=1, name='Milk', price=2.0), Row(id=2, name='Bar', price=1.5)]

In [10]:
# Project a single column by name and print the resulting one-column table.
(
    basic_dataframe
    .select('name')
    .show()
)

+-----+
| name|
+-----+
| Milk|
|  Bar|
|Bread|
+-----+



In [11]:
# Project several columns; `select` takes the names as separate arguments
# as well as a single list, with the same result.
(
    basic_dataframe
    .select('id', 'name')
    .show()
)

+---+-----+
| id| name|
+---+-----+
|  1| Milk|
|  2|  Bar|
|  3|Bread|
+---+-----+



## Modify columns

* add a column based on another column
* drop column
* rename column

In [12]:
# Derive a new column from 'price' (each value multiplied by 1.2).
# `withColumn` returns a new frame; `basic_dataframe` itself is left unchanged.
basic_dataframe_with_additional_column = basic_dataframe.withColumn(
    'price after increase',
    basic_dataframe['price'] * 1.2,
)
basic_dataframe_with_additional_column.show()

+---+-----+-----+--------------------+
| id| name|price|price after increase|
+---+-----+-----+--------------------+
|  1| Milk|  2.0|                 2.4|
|  2|  Bar|  1.5|  1.7999999999999998|
|  3|Bread|  3.5|                 4.2|
+---+-----+-----+--------------------+



In [13]:
# Remove the derived column again; `drop` returns a new frame without it.
basic_dataframe_without_additional_column = (
    basic_dataframe_with_additional_column
    .drop('price after increase')
)
basic_dataframe_without_additional_column.show()

+---+-----+-----+
| id| name|price|
+---+-----+-----+
|  1| Milk|  2.0|
|  2|  Bar|  1.5|
|  3|Bread|  3.5|
+---+-----+-----+



In [14]:
# Rename column 'id' to 'identifier'; the other columns are carried over as they are.
basic_dataframe_with_new_named_column = basic_dataframe.withColumnRenamed(
    'id',
    'identifier',
)
basic_dataframe_with_new_named_column.show()

+----------+-----+-----+
|identifier| name|price|
+----------+-----+-----+
|         1| Milk|  2.0|
|         2|  Bar|  1.5|
|         3|Bread|  3.5|
+----------+-----+-----+

